diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef7be25d24..75d00cd468 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -42,8 +42,8 @@ with the exception that minor releases may include breaking changes.
   [#1569], [#1570], [#1572], [#1573], [#1580], [#1602], [#1620], [#1623],
   [#1624], [#1626], [#1627], [#1635], [#1638], [#1673], [#1675], [#1700],
   [#1710], [#1717], [#1728], [#1730], [#1749], [#1751], [#1762], [#1765],
-  [#1774], [#1780], [#1781], [#1782], [#1787], [#1802], [#1806], [#1807],
-  [#1808]) ([**@burgholzer**], [**@denialhaag**], [**@taminob**],
+  [#1774], [#1780], [#1781], [#1782], [#1787], [#1802], [#1805], [#1806],
+  [#1807], [#1808]) ([**@burgholzer**], [**@denialhaag**], [**@taminob**],
   [**@DRovara**], [**@li-mingbao**], [**@Ectras**], [**@MatthiasReumann**],
   [**@simon1hofmann**])
 
@@ -601,6 +601,7 @@ changelogs._
 [#1808]: https://github.com/munich-quantum-toolkit/core/pull/1808
 [#1807]: https://github.com/munich-quantum-toolkit/core/pull/1807
 [#1806]: https://github.com/munich-quantum-toolkit/core/pull/1806
+[#1805]: https://github.com/munich-quantum-toolkit/core/pull/1805
 [#1802]: https://github.com/munich-quantum-toolkit/core/pull/1802
 [#1787]: https://github.com/munich-quantum-toolkit/core/pull/1787
 [#1782]: https://github.com/munich-quantum-toolkit/core/pull/1782
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Mapping/Mapping.h b/mlir/include/mlir/Dialect/QCO/Transforms/Mapping/Mapping.h
index 0eead57bd9..544f1872c7 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Mapping/Mapping.h
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Mapping/Mapping.h
@@ -11,7 +11,6 @@
 #pragma once
 
 #include "mlir/Dialect/QCO/Transforms/Passes.h"
-#include "mlir/Dialect/QCO/Utils/Algorithms.h"
 
 #include <llvm/Support/LogicalResult.h>
 #include <mlir/IR/Region.h>
@@ -25,7 +24,8 @@ namespace mlir::qco {
  * @brief Create a mapping pass instance with the given target architecture.
  * @returns a pass object.
  */
-std::unique_ptr<Pass> createMappingPass(size_t nqubits, const Edges& coupling,
-                                        MappingPassOptions options);
+std::unique_ptr<Pass>
+createMappingPass(const llvm::DenseSet<std::pair<size_t, size_t>>& coupling,
+                  MappingPassOptions options);
 
 } // namespace mlir::qco
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Algorithms.h b/mlir/include/mlir/Dialect/QCO/Utils/Algorithms.h
deleted file mode 100644
index 77c57585f3..0000000000
--- a/mlir/include/mlir/Dialect/QCO/Utils/Algorithms.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
- * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
- * All rights reserved.
- *
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License
- */
-
-#pragma once
-
-#include <llvm/ADT/DenseSet.h>
-#include <mlir/Support/LLVM.h>
-
-#include <cstddef>
-#include <utility>
-
-namespace mlir::qco {
-
-using Matrix = SmallVector<SmallVector<size_t, 0>, 0>;
-using Edges = llvm::DenseSet<std::pair<size_t, size_t>>;
-
-/**
- * @brief Find all shortest paths between two nodes in a graph.
- * @details Has a time complexity of O(n^3).
- *
- * @link Adapted from https://en.wikipedia.org/wiki/Floyd–Warshall_algorithm
- *
- * @param n The number of nodes in the graph.
- * @param edges The set of edges (i, j).
- *
- * @returns The distance matrix dist, where dist[i, j] is defined as the
- * distance between node i and j.
- */
-Matrix findAllShortestPaths(size_t n, const Edges& edges);
-
-} // namespace mlir::qco
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Drivers.h b/mlir/include/mlir/Dialect/QCO/Utils/Drivers.h
index 3d360ebb78..f359e3194c 100644
--- a/mlir/include/mlir/Dialect/QCO/Utils/Drivers.h
+++ b/mlir/include/mlir/Dialect/QCO/Utils/Drivers.h
@@ -12,7 +12,6 @@
 
 #include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
 #include "mlir/Dialect/QCO/IR/QCOOps.h"
-#include "mlir/Dialect/QCO/Utils/Qubits.h"
 #include "mlir/Dialect/QCO/Utils/WireIterator.h"
 #include "mlir/Dialect/QTensor/IR/QTensorOps.h"
 
@@ -35,63 +34,6 @@
 
 namespace mlir::qco {
 
-using WalkProgramFn = function_ref<WalkResult(Operation*, Qubits&)>;
-
-/**
- * @brief Perform top-down non-recursive walk of all operations within a
- * region of a quantum program and apply a callback function.
- * @details The signature of the callback function is:
- *
- *     (Operation*, Qubits& q) -> WalkResult
- *
- * where the Qubits object tracks the front of qubit SSA values.
- * Depending on the template parameter, the callback is executed before or after
- * updating the Qubits state.
- * @param region The targeted region.
- * @param fn The callback function.
- * @returns success(), if all operations have been visited.
- */
-template <WalkOrder Order = WalkOrder::PreOrder>
-LogicalResult walkProgram(Region& region, const WalkProgramFn& fn) {
-  Qubits qubits;
-  for (Operation& curr : region.getOps()) {
-    if constexpr (Order == WalkOrder::PreOrder) {
-      if (fn(&curr, qubits).wasInterrupted()) {
-        return failure();
-      }
-    }
-
-    TypeSwitch<Operation*>(&curr)
-        .template Case<StaticOp>(
-            [&](StaticOp op) { qubits.add(op.getQubit(), op.getIndex()); })
-        .template Case<AllocOp>([&](AllocOp op) { qubits.add(op.getResult()); })
-        .template Case<UnitaryOpInterface>([&](UnitaryOpInterface& op) {
-          for (const auto& [prevV, nextV] :
-               llvm::zip(op.getInputQubits(), op.getOutputQubits())) {
-            const auto prevQ = cast<TypedValue<QubitType>>(prevV);
-            const auto nextQ = cast<TypedValue<QubitType>>(nextV);
-            qubits.remap(prevQ, nextQ);
-          }
-        })
-        .template Case<ResetOp>([&](ResetOp op) {
-          qubits.remap(op.getQubitIn(), op.getQubitOut());
-        })
-        .template Case<MeasureOp>([&](MeasureOp op) {
-          qubits.remap(op.getQubitIn(), op.getQubitOut());
-        })
-        .template Case<SinkOp>(
-            [&](SinkOp op) { qubits.remove(op.getQubit()); });
-
-    if constexpr (Order == WalkOrder::PostOrder) {
-      if (fn(&curr, qubits).wasInterrupted()) {
-        return failure();
-      }
-    }
-  }
-
-  return success();
-}
-
 using ReleasedOps = SmallVector<Operation*, 8>;
 using PendingWiresMap = DenseMap<Operation*, SmallVector<size_t>>;
 
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Graph.h b/mlir/include/mlir/Dialect/QCO/Utils/Graph.h
new file mode 100644
index 0000000000..203a3590d7
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Utils/Graph.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/Support/Debug.h>
+#include <mlir/Support/LLVM.h>
+
+#include <cassert>
+#include <cstddef>
+#include <optional>
+#include <utility>
+
+namespace mlir::qco {
+
+/// A directed graph stored as per-node adjacency lists.
+class Graph {
+public:
+  class DistanceMatrix {
+    SmallVector<size_t> data_;
+    size_t n_{};
+
+  public:
+    /// Initialize an n-by-n distance matrix with every entry set to `v`.
+    explicit DistanceMatrix(size_t n, size_t v) : data_(n * n, v), n_(n) {}
+
+    /// Return a mutable view of the i-th row.
+    MutableArrayRef<size_t> operator[](size_t i) {
+      assert(i < n_ && "row index out of bounds");
+      return MutableArrayRef<size_t>(data_).slice(i * n_, n_);
+    }
+
+    /// Return a read-only view of the i-th row.
+    ArrayRef<size_t> operator[](size_t i) const {
+      assert(i < n_ && "row index out of bounds");
+      return ArrayRef<size_t>(data_).slice(i * n_, n_);
+    }
+  };
+
+  /// Construct an empty graph.
+  Graph() = default;
+
+  /// Construct graph from edge set by adding each pair as an edge.
+  explicit Graph(const llvm::DenseSet<std::pair<size_t, size_t>>& edges) {
+    for_each(edges, [this](const auto& e) { addEdge(e.first, e.second); });
+  }
+
+  /// Add a directed edge to the internal representation of the graph.
+  /// Implicitly adds nodes.
+  void addEdge(size_t u, size_t v);
+
+  /// Return the neighbours of a node.
+  [[nodiscard]] ArrayRef<size_t> getNeighbours(size_t id) const;
+
+  /// Return the nodes.
+  [[nodiscard]] SmallVector<size_t> getNodes() const;
+
+  /// Return the number of nodes.
+  [[nodiscard]] size_t getNumNodes() const { return adj_.size(); }
+
+  /// Return the degree of a node, i.e., the size of its adjacency list.
+  [[nodiscard]] size_t getDegree(size_t id) const { return adj_.at(id).size(); }
+
+  /// Return the max degree of the graph.
+  [[nodiscard]] size_t getMaxDegree() const;
+
+  /// Return true if the graph has no nodes and edges.
+  [[nodiscard]] bool empty() const { return adj_.empty(); }
+
+  /// Clear the graph by dropping all adjacency lists.
+  void clear() { adj_.clear(); }
+
+  /// Return the minimum distance matrix of the graph by implementing the
+  /// Floyd-Warshall Algorithm
+  /// (https://en.wikipedia.org/wiki/Floyd–Warshall_algorithm) where dist[i][j]
+  /// denotes the distance between i and j.
+  [[nodiscard]] Graph::DistanceMatrix getDistMatrix() const;
+
+  /// Return cycle in graph or `std::nullopt` if none exists.
+  /// Implements an iterative depth-first search inspired by LLVM's SCC
+  /// utilities. For a cycle [A, B, C, A], the function returns [A, B, C].
+  [[nodiscard]] std::optional<SmallVector<size_t>> findCycle() const;
+
+private:
+  llvm::DenseMap<size_t, SmallVector<size_t>> adj_;
+};
+} // namespace mlir::qco
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Layout.h b/mlir/include/mlir/Dialect/QCO/Utils/Layout.h
new file mode 100644
index 0000000000..f4fc4c3530
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Utils/Layout.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/Support/LLVM.h>
+
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+
+namespace mlir::qco {
+
+/// A qubit layout that maps program and hardware indices without
+/// storing Values. Used for efficient memory usage when Value tracking isn't
+/// needed.
+///
+/// Note that we use the terminology "hardware" and "program" qubits
+/// here, because "virtual" (opposed to physical) and "static" (opposed to
+/// dynamic) are C++ keywords.
+class Layout {
+public:
+  /// Construct and return a random layout with size `nqubits`.
+  static Layout random(size_t nqubits, size_t seed);
+
+  /// Insert program:hardware index mapping.
+  void add(size_t prog, size_t hw);
+
+  /// Look up and return the program index for a hardware index.
+  [[nodiscard]] size_t getProgramIndex(size_t hw) const;
+
+  /// Look up and return the hardware index for a program index.
+  [[nodiscard]] size_t getHardwareIndex(size_t prog) const;
+
+  /// Look up multiple hardware indices at once; returns them as a std::tuple.
+  template <typename... ProgIndices>
+    requires(sizeof...(ProgIndices) > 0) &&
+            ((std::is_convertible_v<ProgIndices, size_t>) && ...)
+  [[nodiscard]] auto getHardwareIndices(ProgIndices... progs) const {
+    return std::tuple{getHardwareIndex(static_cast<size_t>(progs))...};
+  }
+
+  /// Look up multiple program indices at once; returns them as a std::tuple.
+  template <typename... HwIndices>
+    requires(sizeof...(HwIndices) > 0) &&
+            ((std::is_convertible_v<HwIndices, size_t>) && ...)
+  [[nodiscard]] auto getProgramIndices(HwIndices... hws) const {
+    return std::tuple{getProgramIndex(static_cast<size_t>(hws))...};
+  }
+
+  /// Swap the program indices mapped to two hardware indices.
+  void swap(size_t hwA, size_t hwB);
+
+  /// Return the number of qubits managed by the layout.
+  [[nodiscard]] size_t nqubits() const;
+
+  /// Return the program to hardware mapping.
+  [[nodiscard]] ArrayRef<size_t> getProgramToHardware() const;
+
+protected:
+  /// Maps a program qubit index to its hardware index.
+  SmallVector<size_t> programToHardware_;
+  /// Maps a hardware qubit index to its program index.
+  SmallVector<size_t> hardwareToProgram_;
+
+private:
+  explicit Layout(const size_t nqubits)
+      : programToHardware_(nqubits), hardwareToProgram_(nqubits) {}
+};
+} // namespace mlir::qco
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Qubits.h b/mlir/include/mlir/Dialect/QCO/Utils/Qubits.h
deleted file mode 100644
index b08952aa46..0000000000
--- a/mlir/include/mlir/Dialect/QCO/Utils/Qubits.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
- * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
- * All rights reserved.
- *
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License
- */
-
-#pragma once
-
-#include "mlir/Dialect/QCO/IR/QCODialect.h"
-
-#include <mlir/IR/Value.h>
-#include <mlir/Support/LLVM.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <utility>
-
-namespace mlir::qco {
-class Qubits {
-  /**
-   * @brief Specifies the qubit "location" (hardware or program).
-   */
-  enum class QubitLocation : std::uint8_t { Hardware, Program };
-
-public:
-  /**
-   * @brief Add qubit with automatically assigned dynamic index.
-   */
-  void add(TypedValue<QubitType> q);
-
-  /**
-   * @brief Add qubit with static index.
-   */
-  void add(TypedValue<QubitType> q, std::size_t hw);
-
-  /**
-   * @brief Remap the qubit value from prev to next.
-   */
-  void remap(TypedValue<QubitType> prev, TypedValue<QubitType> next);
-
-  /**
-   * @brief Remove the qubit value.
-   */
-  void remove(TypedValue<QubitType> q);
-
-  /**
-   * @returns the qubit value assigned to a program index.
-   */
-  [[nodiscard]] TypedValue<QubitType> getProgramQubit(std::size_t index) const;
-
-  /**
-   * @returns the qubit value assigned to a hardware index.
-   */
-  [[nodiscard]] TypedValue<QubitType> getHardwareQubit(std::size_t index) const;
-
-  /**
-   * @returns the index assigned to the qubit value.
-   */
-  [[nodiscard]] std::size_t getIndex(TypedValue<QubitType> q) const;
-
-private:
-  DenseMap<std::size_t, TypedValue<QubitType>> programToValue_;
-  DenseMap<std::size_t, TypedValue<QubitType>> hardwareToValue_;
-  DenseMap<TypedValue<QubitType>, std::pair<QubitLocation, std::size_t>>
-      valueToIndex_;
-};
-} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp b/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp
index 74a6d9e6db..708a1992c1 100644
--- a/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp
+++ b/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp
@@ -11,42 +11,49 @@
 #include "mlir/Dialect/QCO/Transforms/Mapping/Mapping.h"
 
 #include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
 #include "mlir/Dialect/QCO/IR/QCOOps.h"
-#include "mlir/Dialect/QCO/Utils/Algorithms.h"
 #include "mlir/Dialect/QCO/Utils/Drivers.h"
+#include "mlir/Dialect/QCO/Utils/Graph.h"
+#include "mlir/Dialect/QCO/Utils/Layout.h"
 #include "mlir/Dialect/QCO/Utils/WireIterator.h"
 #include "mlir/Dialect/QTensor/IR/QTensorOps.h"
 #include "mlir/Dialect/QTensor/Utils/TensorIterator.h"
 
-#include <llvm/ADT/ArrayRef.h>
 #include <llvm/ADT/PriorityQueue.h>
 #include <llvm/ADT/STLExtras.h>
 #include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/iterator_range.h>
 #include <llvm/Support/Allocator.h>
+#include <llvm/Support/Debug.h>
 #include <llvm/Support/ErrorHandling.h>
+#include <llvm/Support/LogicalResult.h>
 #include <mlir/Analysis/TopologicalSortUtils.h>
 #include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/SCF/IR/SCF.h>
 #include <mlir/IR/Block.h>
+#include <mlir/IR/Builders.h>
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/Diagnostics.h>
+#include <mlir/IR/Dominance.h>
 #include <mlir/IR/Location.h>
 #include <mlir/IR/PatternMatch.h>
 #include <mlir/IR/Threading.h>
 #include <mlir/IR/Value.h>
+#include <mlir/IR/ValueRange.h>
 #include <mlir/Pass/Pass.h>
 #include <mlir/Support/LLVM.h>
 #include <mlir/Support/WalkResult.h>
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
 #include <memory>
-#include <numeric>
 #include <random>
 #include <ranges>
-#include <string_view>
 #include <tuple>
 #include <utility>
 #include <vector>
@@ -64,235 +71,107 @@ namespace {
 
 struct MappingPass : impl::MappingPassBase<MappingPass> {
 private:
-  using IndexType = size_t;
-  using IndexPairType = std::pair<IndexType, IndexType>;
+  using IndexPairType = std::pair<size_t, size_t>;
   using Window = SmallVector<IndexPairType>;
-  using Neighbours = SmallVector<SmallVector<size_t, 4>>;
-
-  enum class RoutingMode : std::uint8_t { Cold, Hot };
-
-  /**
-   * @brief A qubit layout that maps program and hardware indices without
-   * storing Values. Used for efficient memory usage when Value tracking isn't
-   * needed.
-   *
-   * Note that we use the terminology "hardware" and "program" qubits here,
-   * because "virtual" (opposed to physical) and "static" (opposed to dynamic)
-   * are C++ keywords.
-   */
-  class [[nodiscard]] Layout {
-  public:
-    /**
-     * @brief Constructs the identity (i->i) layout.
-     * @param nqubits The number of qubits.
-     * @return The identity layout.
-     */
-    static Layout identity(const size_t nqubits) {
-      Layout layout(nqubits);
-      for (size_t i = 0; i < nqubits; ++i) {
-        layout.add(i, i);
-      }
-      return layout;
-    }
-
-    /**
-     * @brief Constructs a random layout.
-     * @param nqubits The number of qubits.
-     * @param seed A seed for randomization.
-     * @return The random layout.
-     */
-    static Layout random(const size_t nqubits, const size_t seed) {
-      SmallVector<IndexType> mapping(nqubits);
-      std::iota(mapping.begin(), mapping.end(), IndexType{0});
-      std::ranges::shuffle(mapping, std::mt19937_64{seed});
-
-      Layout layout(nqubits);
-      for (const auto [prog, hw] : enumerate(mapping)) {
-        layout.add(prog, hw);
-      }
+  using Wires = SmallVector<WireIterator>;
 
-      return layout;
-    }
-
-    /**
-     * @brief Insert program:hardware index mapping.
-     * @param prog The program index.
-     * @param hw The hardware index.
-     */
-    void add(IndexType prog, IndexType hw) {
-      assert(prog < programToHardware_.size() &&
-             "add: program index out of bounds");
-      assert(hw < hardwareToProgram_.size() &&
-             "add: hardware index out of bounds");
-      programToHardware_[prog] = hw;
-      hardwareToProgram_[hw] = prog;
-    }
-
-    /**
-     * @brief Look up program index for a hardware index.
-     * @param hw The hardware index.
-     * @return The program index of the respective hardware index.
-     */
-    [[nodiscard]] IndexType getProgramIndex(const IndexType hw) const {
-      assert(hw < hardwareToProgram_.size() &&
-             "getProgramIndex: hardware index out of bounds");
-      return hardwareToProgram_[hw];
-    }
-
-    /**
-     * @brief Look up hardware index for a program index.
-     * @param prog The program index.
-     * @return The hardware index of the respective program index.
-     */
-    [[nodiscard]] IndexType getHardwareIndex(const IndexType prog) const {
-      assert(prog < programToHardware_.size() &&
-             "getHardwareIndex: program index out of bounds");
-      return programToHardware_[prog];
-    }
-
-    /**
-     * @brief Convenience function to lookup multiple hardware indices at once.
-     * @param progs The program indices.
-     * @return A tuple of hardware indices.
-     */
-    template <typename... ProgIndices>
-      requires(sizeof...(ProgIndices) > 0) &&
-              ((std::is_convertible_v<ProgIndices, IndexType>) && ...)
-    [[nodiscard]] auto getHardwareIndices(ProgIndices... progs) const {
-      return std::tuple{getHardwareIndex(static_cast<IndexType>(progs))...};
-    }
-
-    /**
-     * @brief Convenience function to lookup multiple program indices at once.
-     * @param hws The hardware indices.
-     * @return A tuple of program indices.
-     */
-    template <typename... HwIndices>
-      requires(sizeof...(HwIndices) > 0) &&
-              ((std::is_convertible_v<HwIndices, size_t>) && ...)
-    [[nodiscard]] auto getProgramIndices(HwIndices... hws) const {
-      return std::tuple{getProgramIndex(static_cast<IndexType>(hws))...};
-    }
-
-    /**
-     * @brief Swap the mapping to program indices of two hardware indices.
-     */
-    void swap(const IndexType hw0, const IndexType hw1) {
-      const auto prog0 = hardwareToProgram_[hw0];
-      const auto prog1 = hardwareToProgram_[hw1];
-
-      std::swap(hardwareToProgram_[hw0], hardwareToProgram_[hw1]);
-      std::swap(programToHardware_[prog0], programToHardware_[prog1]);
-    }
-
-    /**
-     * @returns the number of qubits managed by the layout.
-     */
-    [[nodiscard]] size_t nqubits() const { return programToHardware_.size(); }
-
-    /**
-     * @returns the program to hardware mapping.
-     */
-    [[nodiscard]] ArrayRef<IndexType> getProgramToHardware() const {
-      return programToHardware_;
-    }
-
-  protected:
-    /**
-     * @brief Maps a program qubit index to its hardware index.
-     */
-    SmallVector<IndexType> programToHardware_;
-
-    /**
-     * @brief Maps a hardware qubit index to its program index.
-     */
-    SmallVector<IndexType> hardwareToProgram_;
+  enum class RoutingMode : bool { Cold, Hot };
 
-  private:
-    explicit Layout(const size_t nqubits)
-        : programToHardware_(nqubits), hardwareToProgram_(nqubits) {}
-  };
-
-  class [[nodiscard]] AugmentedDevice {
+  class AugmentedDevice {
   public:
-    AugmentedDevice() = default;
-
-    AugmentedDevice(size_t nqubits, const Edges& coupling)
-        : nqubits_(nqubits), dist_(findAllShortestPaths(nqubits, coupling)),
-          coupling_(coupling), neighbours_(nqubits) {
-      for (const auto& [u, v] : coupling_) {
-        neighbours_[u].push_back(v);
-      }
-    }
+    explicit AugmentedDevice(
+        const llvm::DenseSet<std::pair<size_t, size_t>>& couplingSet)
+        : coupling_(couplingSet), dist_(coupling_.getDistMatrix()) {}
 
-    /**
-     * @returns the device's number of qubits.
-     */
-    [[nodiscard]] size_t nqubits() const { return nqubits_; }
+    /// Return the device's number of qubits.
+    [[nodiscard]] size_t nqubits() const { return coupling_.getNumNodes(); }
 
-    /**
-     * @returns true if @p u and @p v are adjacent.
-     */
+    /// Return true if two qubits are adjacent.
     [[nodiscard]] bool areAdjacent(size_t u, size_t v) const {
-      return coupling_.contains(std::make_pair(u, v));
+      return dist_[u][v] == 1UL;
     }
 
-    /**
-     * @returns the length of the shortest path between @p u and @p v.
-     */
+    /// Return the length of the shortest path between two qubits.
     [[nodiscard]] size_t distanceBetween(size_t u, size_t v) const {
-      if (dist_[u][v] == UINT64_MAX) {
+      const auto dist = dist_[u][v];
+      if (dist == UINT64_MAX) {
         report_fatal_error("Failed to compute the distance between qubits " +
                            Twine(u) + " and " + Twine(v));
       }
-      return dist_[u][v];
+      return dist;
     }
 
-    /**
-     * @returns all neighbours of @p u.
-     */
-    [[nodiscard]] ArrayRef<size_t> neighboursOf(size_t u) const {
-      return neighbours_[u];
+    /// Return the qubit identifiers.
+    [[nodiscard]] SmallVector<size_t> qubits() const {
+      return coupling_.getNodes();
     }
 
-    /**
-     * @returns the max degree (connectivity) of any qubit of the device.
-     */
-    [[nodiscard]] size_t maxDegree() const {
-      size_t deg = 0;
-      for (const auto& nbrs : neighbours_) {
-        deg = std::max(deg, nbrs.size());
-      }
-      return deg;
+    /// Return all neighbours of a qubit.
+    [[nodiscard]] ArrayRef<size_t> neighboursOf(size_t u) const {
+      return coupling_.getNeighbours(u);
     }
 
+    /// Return the max degree (connectivity) of any qubit of the device.
+    [[nodiscard]] size_t maxDegree() const { return coupling_.getMaxDegree(); }
+
   private:
-    size_t nqubits_{};
-    Matrix dist_;
-    Edges coupling_;
-    Neighbours neighbours_;
+    Graph coupling_;
+    Graph::DistanceMatrix dist_;
   };
 
-  struct [[nodiscard]] Trial {
-    explicit Trial(Layout layout) : layout(std::move(layout)) {}
+  struct WireInfos {
+    /// Return the mapped wire index of a program index.
+    [[nodiscard]] size_t lookupIndex(size_t prog) const {
+      return programToIndex_.at(prog);
+    }
 
-    Layout layout;
-    size_t nswaps{};
-    bool success{false};
+    /// Return the mapped program index of a wire index.
+    [[nodiscard]] size_t lookupProgram(size_t index) const {
+      return indexToProgram_.at(index);
+    }
+
+    /// Bidirectionally map a wire index to a program index.
+    /// Overwrites existing mappings.
+    void map(size_t index, size_t prog) {
+      indexToProgram_[index] = prog;
+      programToIndex_[prog] = index;
+    }
+
+    /// Swap two program indices.
+    void swap(size_t prog0, size_t prog1) {
+      const auto i0 = lookupIndex(prog0);
+      const auto i1 = lookupIndex(prog1);
+      std::swap(programToIndex_[prog0], programToIndex_[prog1]);
+      std::swap(indexToProgram_[i0], indexToProgram_[i1]);
+    }
+
+  private:
+    /// Maps the i-th wire index to a program index.
+    DenseMap<size_t, size_t> indexToProgram_;
+    /// Maps a program index to the i-th wire index.
+    DenseMap<size_t, size_t> programToIndex_;
+  };
+
+  /// Statistics collected while routing.
+  struct Statistics {
+    size_t nswaps{0};
   };
 
-  /**
-   * @brief Parameters influencing the behavior of the A* search algorithm.
-   */
-  struct [[nodiscard]] Parameters {
+  /// Parameters influencing the behavior of the A* search algorithm.
+  struct Parameters {
     float alpha;
     float lambda;
   };
 
-  /**
-   * @brief Describes a node in the A* search graph.
-   */
-  struct [[nodiscard]] Node {
+  /// Utility-struct for routing functions.
+  struct RoutingBundle {
+    Wires wires;
+    WireInfos infos;
+    Layout layout;
+  };
+
+  /// Describes a node in the A* search graph.
+  struct Node {
     struct ComparePointer {
       bool operator()(const Node* lhs, const Node* rhs) const {
         return lhs->f > rhs->f;
@@ -305,17 +184,13 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     size_t depth;
     float f;
 
-    /**
-     * @brief Construct a root node with the given layout. Initialize the
-     * sequence with an empty vector and set the cost to zero.
-     */
+    /// Construct a root node with the given layout. Initialize the
+    /// sequence with an empty vector and set the cost to zero.
     explicit Node(Layout layout)
         : layout(std::move(layout)), parent(nullptr), depth(0), f(0) {}
 
-    /**
-     * @brief Construct a non-root node from its parent node. Apply the given
-     * swap to the layout of the parent node.
-     */
+    /// Construct a non-root node from its parent node. Apply the given swap to
+    /// the layout of the parent node.
     Node(Node* parent, const IndexPairType& swap, const Window& window,
          const AugmentedDevice& device, const Parameters& params)
         : layout(parent->layout), swap(swap), parent(parent),
@@ -324,36 +199,29 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
       f = g(params.alpha) + h(window, device, params); // NOLINT
     }
 
-    /**
-     * @returns true if the current SWAP sequence makes all gates in the front
-     * executable.
-     */
+    /// Return true, if the current SWAP sequence makes all gates in the front
+    /// executable.
     [[nodiscard]] bool isGoal(const IndexPairType& front,
                               const AugmentedDevice& device) const {
-      return device.areAdjacent(layout.getHardwareIndex(front.first),
-                                layout.getHardwareIndex(front.second));
+      const auto [hw0, hw1] =
+          layout.getHardwareIndices(front.first, front.second);
+      return device.areAdjacent(hw0, hw1);
     }
 
   private:
-    /**
-     * @brief Calculate the path cost for the A* search algorithm.
-     *
-     * The path cost function is the weighted sum of the currently required
-     * SWAPs.
-     */
+    /// Calculate the path cost for the A* search algorithm.
+    /// The path costs are the weighted sum of the currently required SWAPs.
     [[nodiscard]] float g(const float alpha) const {
       return alpha * static_cast<float>(depth);
     }
 
-    /**
-     * @brief Calculate the heuristic cost for the A* search algorithm.
-     *
-     * Computes the minimal number of SWAPs required to route each gate in
-     * each layer. For each gate, this is determined by the shortest distance
-     * between its hardware qubits. Intuitively, this is the number of SWAPs
-     * that a naive router would insert to route the layers (with a constant
-     * layout).
-     */
+    /// Calculate the heuristic cost for the A* search algorithm.
+    ///
+    /// Computes the minimal number of SWAPs required to route each gate in
+    /// each layer. For each gate, this is determined by the shortest distance
+    /// between its hardware qubits. Intuitively, this is the number of SWAPs
+    /// that a naive router would insert to route the layers (with a constant
+    /// layout).
     [[nodiscard]] float h(const Window& window, const AugmentedDevice& device,
                           const Parameters& params) const {
       float costs{0};
@@ -371,25 +239,35 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
   };
 
 public:
+  /// Construct default mapping pass.
   MappingPass() = default;
+
+  /// Construct default mapping pass with options.
   explicit MappingPass(MappingPassOptions options) : MappingPassBase(options) {}
-  explicit MappingPass(size_t nqubits, const Edges& coupling,
-                       MappingPassOptions options = {})
-      : MappingPassBase(options), device(nqubits, coupling) {}
+
+  /// Construct mapping from coupling set.
+  explicit MappingPass(
+      const llvm::DenseSet<std::pair<size_t, size_t>>& couplingSet,
+      MappingPassOptions options)
+      : MappingPassBase(options),
+        device(std::make_shared<AugmentedDevice>(couplingSet)) {}
 
 protected:
   void runOnOperation() override {
-    assert(alpha > 0 && "runOnOperation: expected alpha > 0");
-    assert(niterations > 0 && "runOnOperation: expected niterations > 0");
-    assert(ntrials > 0 && "runOnOperation: expected ntrials > 0");
+    assert(alpha > 0 && "expected alpha > 0");
+    assert(niterations > 0 && "expected niterations > 0");
+    assert(ntrials > 0 && "expected ntrials > 0");
+
+    if (!device) {
+      llvm::reportFatalUsageError("No device specified!");
+    }
 
-    std::mt19937_64 rng{seed};
     IRRewriter rewriter(&getContext());
 
-    ModuleOp m = getOperation();
-    auto func = getEntryPoint(m);
+    auto mod = getOperation();
+    auto func = getEntryPoint(mod);
     if (!func) {
-      m.emitError() << "does not contain an entry point function";
+      mod.emitError() << "does not contain an entry point function";
       signalPassFailure();
       return;
     }
@@ -400,93 +278,135 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
       return;
     }
 
-    if (comp->size() > device.nqubits()) {
-      m.emitError() << "requires " + Twine(comp.value().size()) +
-                           " qubits. However, the architecture only supports " +
-                           Twine(device.nqubits()) + "qubits.";
+    auto& body = func.getFunctionBody();
+    auto& [wires, infos] = *comp;
+
+    if (wires.size() > device->nqubits()) {
+      func.emitError()
+          << "requires " + Twine(wires.size()) +
+                 " qubits. However, the architecture only supports " +
+                 Twine(device->nqubits()) + " qubits.";
       signalPassFailure();
       return;
     }
 
-    // Create trials for initial layout refining. Currently, this includes
-    // `ntrials` many random layouts.
-    SmallVector<Trial> trials;
-    trials.reserve(ntrials);
-    for (size_t i = 0; i < ntrials; ++i) {
-      trials.emplace_back(Layout::random(device.nqubits(), rng()));
-    }
-
-    // Execute each of the trials (possibly in parallel). Collect the results
-    // and find the one with the fewest SWAPs on the final backwards pass.
-    parallelForEach(&getContext(), trials, [&, this](Trial& trial) {
-      if (const auto res = refineLayout(*comp, trial.layout); succeeded(res)) {
-        trial.success = true;
-        trial.nswaps = *res;
-      }
-    });
-
-    Trial* best = findBestTrial(trials);
-    if (best == nullptr) {
-      func.emitError() << "failed to find the best layout trial";
+    auto layout = generateLayout(wires, infos);
+    if (failed(layout)) {
+      func->emitError() << "failed to refine random initial layouts.";
       signalPassFailure();
       return;
     }
 
-    // Perform placement and hot routing by inserting SWAPs into the IR.
-    auto placedWires = place(func, best->layout, rewriter);
+    std::tie(wires, infos) = place(body, *layout, rewriter);
+
+    Statistics stats;
+    RoutingBundle bundle{.wires = std::move(wires),
+                         .infos = std::move(infos),
+                         .layout = std::move(*layout)};
+
     const auto res = route<WireDirection::Forward, RoutingMode::Hot>(
-        placedWires, best->layout, &rewriter);
-    if (failed(res)) {
-      func.emitError() << "failed to map the " << func.getName() << " function";
+        bundle, stats, &rewriter);
+    if (res.failed()) {
+      func.emitError() << "failed to map the function";
       signalPassFailure();
       return;
     }
 
     // Collect statistics.
-    numSwaps += *res;
+    numSwaps += stats.nswaps;
 
     // Fix SSA Dominance issues.
-    for_each(func.getFunctionBody().getBlocks(),
-             [](Block& b) { sortTopologically(&b); });
+    llvm::for_each(body.getBlocks(), [](Block& b) { sortTopologically(&b); });
   }
 
 private:
-  /**
-   * @brief Collect wires of the quantum computation before placement.
-   * @details
-   * The mapping pass currently assumes that the quantum computation allocates
-   * all tensors at the start of the function. The required qubits are extracted
-   * from these tensors and used for the computation. Finally, the qubits are
-   * inserted back into the tensors at the end of the function.
-   * Thus, a valid program has the following structure:
-   *
-   *    T ⨉ [qtensor::AllocOp]
-   *  → N ⨉ [qtensor::ExtractOp]
-   *  → (Computation)
-   *  → N ⨉ [qtensor::InsertOp]
-   *  → T ⨉ [qtensor::DeallocOp]
-   *
-   * @returns a vector of wire iterator, or failure() if any of the above
-   * assumptions are violated.
-   */
-  static FailureOr<SmallVector<WireIterator>>
+  /// Extend the init arguments of an `scf::ForOp` by adding a given range of
+  /// additional SSA values. Replaces the existing operation and returns the
+  /// newly created one.
+  static scf::ForOp extend(scf::ForOp loop, ValueRange addons,
+                           IRRewriter& rewriter) {
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(loop);
+
+    const auto naddons = addons.size();
+
+    SmallVector<Value> inits;
+    llvm::append_range(inits, loop.getInits());
+    llvm::append_range(inits, addons);
+
+    auto newLoop = rewriter.create<scf::ForOp>(
+        loop.getLoc(), loop.getLowerBound(), loop.getUpperBound(),
+        loop.getStep(), inits);
+
+    Block* loopBody = loop.getBody();
+    Block* newLoopBody = newLoop.getBody();
+
+    rewriter.mergeBlocks(
+        loopBody, newLoopBody,
+        newLoopBody->getArguments().take_front(loopBody->getNumArguments()));
+
+    for (const auto [before, after] :
+         llvm::zip_first(loop.getResults(), newLoop.getResults())) {
+      rewriter.replaceAllUsesWith(before, after);
+    }
+
+    for (const auto [before, after] :
+         llvm::zip_equal(addons, newLoop.getResults().take_back(naddons))) {
+      rewriter.replaceAllUsesExcept(before, after, newLoop);
+    }
+
+    auto yield = cast<scf::YieldOp>(newLoopBody->getTerminator());
+
+    SmallVector<Value> results;
+    llvm::append_range(results, yield.getResults());
+    llvm::append_range(results, newLoop.getRegionIterArgs().take_back(naddons));
+    rewriter.setInsertionPoint(yield);
+    rewriter.replaceOpWithNewOp<scf::YieldOp>(yield, results);
+
+    rewriter.eraseOp(loop);
+    return newLoop;
+  }
+
+  /// Return the wires of a dynamic computation.
+  /// The mapping pass currently assumes that
+  /// - there are no `qco.alloc` operations
+  /// - there is an "extraction" and "insertion" phase, where the i-th extract
+  ///   defines the i-th program qubit
+  /// Thus, supported programs have the following structure:
+  ///
+  ///   T ⨉ [qtensor::AllocOp]
+  /// → N ⨉ [qtensor::ExtractOp]
+  /// → (Computation)
+  /// → N ⨉ [qtensor::InsertOp]
+  /// → T ⨉ [qtensor::DeallocOp]
+  ///
+  /// If any of the above assumptions are violated, the function returns
+  /// failure.
+  static FailureOr<std::pair<Wires, WireInfos>>
   getComputation(func::FuncOp func) {
     if (!func.getOps<AllocOp>().empty()) {
-      func.emitError() << "must not contain qco.alloc operations";
-      return failure();
+      return func.emitError() << "must not contain qco.alloc operations";
     }
 
-    SmallVector<WireIterator> wires;
-    for (auto tensor : func.getOps<qtensor::AllocOp>()) {
+    Wires wires;
+    WireInfos infos;
+
+    for (auto alloc : func.getOps<qtensor::AllocOp>()) {
       bool isInitPhase = true;
-      TensorIterator it(tensor.getResult());
+      TensorIterator it(alloc.getResult());
       for (; it != std::default_sentinel; ++it) {
         if (auto extract = dyn_cast<ExtractOp>(it.operation())) {
           if (!isInitPhase) {
-            func.emitError() << "must extract and insert all qubits at once.";
-            return failure();
+            return func.emitError()
+                   << "must extract and insert all qubits at once.";
           }
-          wires.emplace_back(extract.getResult());
+
+          const auto qubit = extract.getResult();
+          const auto index = wires.size();
+
+          wires.emplace_back(qubit);
+          infos.map(index, index);
+
           continue;
         }
 
@@ -496,36 +416,37 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
         }
       }
     }
-    return wires;
+
+    return std::make_pair(wires, infos);
   }
 
-  /**
-   * @brief Perform placement by replacing dynamic with static qubits.
-   * @details
-   * Creates static qubits and replaces the extracted qubits with it.
-   * Moreover, the function extends the computation with as many static qubits
-   * as the architecture supports.
-   * @returns a vector of wire iterators, where the i-th wire points at the i-th
-   * static program qubit.
-   */
-  static SmallVector<WireIterator>
-  place(func::FuncOp func, const Layout& layout, IRRewriter& rewriter) {
+  /// Perform placement by
+  /// - initializing as many hardware qubits as the architecture supports
+  /// - replacing dynamic with static qubits
+  /// - extending the inputs of `scf::ForOp` to all hardware qubits.
+  ///
+  /// Analogously to the getComputation function, the i-th extract
+  /// operation defines the i-th program qubit.
+  static std::pair<Wires, WireInfos> place(Region& body, const Layout& layout,
+                                           IRRewriter& rewriter) {
     SmallVector<StaticOp> staticOps;
     staticOps.reserve(layout.nqubits());
 
     // Create and save static qubit operations.
-    rewriter.setInsertionPointToStart(&func.getFunctionBody().front());
+    rewriter.setInsertionPointToStart(&body.front());
     for (size_t i = 0; i < layout.nqubits(); ++i) {
-      const auto op = StaticOp::create(rewriter, func.getLoc(), i);
+      const auto op = StaticOp::create(rewriter, body.getLoc(), i);
       staticOps.emplace_back(op);
       rewriter.setInsertionPointAfter(op);
     }
 
     // Replace extract ops and collect in program-qubit order.
-    SmallVector<WireIterator> placedWires(layout.nqubits());
 
-    size_t prog = 0UL;
-    for (auto alloc : make_early_inc_range(func.getOps<qtensor::AllocOp>())) {
+    Wires wires;
+    WireInfos infos;
+
+    for (auto alloc :
+         llvm::make_early_inc_range(body.getOps<qtensor::AllocOp>())) {
       TensorIterator it(alloc.getResult());
       while (it != std::default_sentinel) {
         // Get the operation and early increment to avoid issues after erasure.
@@ -534,6 +455,7 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
 
         TypeSwitch<Operation*>(curr)
             .Case<ExtractOp>([&](auto op) {
+              const auto prog = wires.size();
               const auto hw = layout.getHardwareIndex(prog);
               const auto qubit = staticOps[hw].getQubit();
 
@@ -541,8 +463,8 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
               rewriter.replaceAllUsesWith(op.getOutTensor(), op.getTensor());
               rewriter.eraseOp(op);
 
-              placedWires[prog] = WireIterator(qubit);
-              ++prog;
+              wires.emplace_back(qubit);
+              infos.map(prog, prog);
             })
             .Case<InsertOp>([&](auto op) {
               rewriter.setInsertionPointAfter(op);
@@ -557,84 +479,138 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     }
 
     // Create sinks for remaining, unused, static qubits.
-    rewriter.setInsertionPoint(func.getFunctionBody().back().getTerminator());
-    for (; prog < layout.nqubits(); ++prog) {
+
+    rewriter.setInsertionPoint(body.back().getTerminator());
+    for (size_t prog = wires.size(); prog < layout.nqubits(); ++prog) {
       const auto hw = layout.getHardwareIndex(prog);
       const auto qubit = staticOps[hw].getQubit();
-      placedWires[prog] = WireIterator(qubit);
-      SinkOp::create(rewriter, func->getLoc(), qubit);
-    }
 
-    return placedWires;
-  }
+      wires.emplace_back(qubit);
+      infos.map(prog, prog);
 
-  /**
-   * @brief Find the best trial result in terms of the number of SWAPs.
-   * @returns the best trial result or nullptr if no result is valid.
-   */
-  [[nodiscard]] static Trial* findBestTrial(MutableArrayRef<Trial> trials) {
-    Trial* best = nullptr;
-    for (auto& trial : trials) {
-      if (!trial.success) {
-        continue;
-      }
+      SinkOp::create(rewriter, body.getLoc(), qubit);
+    }
 
-      if (best == nullptr || best->nswaps > trial.nswaps) {
-        best = &trial;
+    // Finally, update the SCF operations such that they take all static qubits
+    // as input. To handle recursively nested SCF operations, use a stack of
+    // (region, current qubit values) pairs.
+
+    SmallVector<std::pair<Region&, DenseSet<Value>>> stack;
+    stack.emplace_back(body, DenseSet<Value>{});
+
+    while (!stack.empty()) {
+      auto [region, qubits] = stack.pop_back_val();
+
+      for (Operation& op : llvm::make_early_inc_range(region.getOps())) {
+        TypeSwitch<Operation*>(&op)
+            .Case<StaticOp>([&](StaticOp op) { qubits.insert(op.getQubit()); })
+            .Case<UnitaryOpInterface>([&](UnitaryOpInterface& op) {
+              for (const auto [pred, succ] :
+                   llvm::zip_equal(op.getInputQubits(), op.getOutputQubits())) {
+                qubits.insert(succ);
+                qubits.erase(pred);
+              }
+            })
+            .Case<scf::ForOp>([&](scf::ForOp loop) {
+              assert(qubits.size() == layout.nqubits());
+
+              DenseSet<Value> addons(qubits);
+              llvm::for_each(loop.getInits(), [&](auto v) { addons.erase(v); });
+              auto newLoop = extend(loop, to_vector(addons), rewriter);
+
+              for (OpOperand& operand : newLoop.getInitsMutable()) {
+                qubits.insert(newLoop.getTiedLoopResult(&operand));
+                qubits.erase(operand.get());
+              }
+
+              stack.emplace_back(
+                  newLoop.getRegion(),
+                  DenseSet<Value>(newLoop.getRegionIterArgs().begin(),
+                                  newLoop.getRegionIterArgs().end()));
+            })
+            .Case<ResetOp, MeasureOp>([&](auto op) {
+              qubits.insert(op.getQubitOut());
+              qubits.erase(op.getQubitIn());
+            })
+            .Case<AllocOp, qtensor::AllocOp>([&](auto) {
+              llvm::reportFatalInternalError("unexpected dynamic qubit alloc");
+            });
       }
     }
 
-    return best;
+    return std::make_pair(wires, infos);
   }
 
-  /**
-   * @brief Refine the trial's layout and count #swaps for the final backwards
-   * pass.
-   * @details Use the SABRE Approach to improve the initial layout:
-   * Traverse the layers from left-to-right-to-left and cold-route
-   * along the way. Repeat this procedure "niterations" times.
-   * @returns failure() if routing fails.
-   */
-  FailureOr<size_t> refineLayout(SmallVector<WireIterator> wires,
-                                 Layout& layout) {
-    size_t nswaps{0};
-    for (size_t i = 0; i < niterations; ++i) {
-      if (failed(route<WireDirection::Forward>(wires, layout))) {
-        return failure();
+  /// Execute `ntrials` many (parallel) initial layout refinement trials and
+  /// return the heuristically best one.
+  ///
+  /// The function uses the SABRE Approach to improve the initial layout:
+  /// Traverse the layers of the program from left-to-right-to-left and
+  /// cold-route along the way. Repeat this procedure "niterations" times and
+  /// finally find the trial with the fewest SWAPs on the final backwards pass
+  /// and return the respective layout.
+  FailureOr<Layout> generateLayout(const Wires& wires, const WireInfos& infos) {
+    std::mt19937_64 rng{seed};
+
+    struct Trial {
+      RoutingBundle bundle;
+      Statistics stats{};
+      bool success{false};
+    };
+
+    SmallVector<Trial, 0> trials;
+    trials.reserve(ntrials);
+    for (size_t i = 0; i < ntrials; ++i) {
+      trials.emplace_back(
+          RoutingBundle{.wires = wires,
+                        .infos = infos,
+                        .layout = Layout::random(device->nqubits(), rng())});
+    }
+
+    parallelForEach(&getContext(), trials, [&, this](Trial& t) {
+      for (size_t i = 0; i < niterations; ++i) {
+        if (route<WireDirection::Forward>(t.bundle, t.stats).failed()) {
+          return;
+        }
+        t.stats.nswaps = 0;
+        if (route<WireDirection::Backward>(t.bundle, t.stats).failed()) {
+          return;
+        }
       }
 
-      const auto resB = route<WireDirection::Backward>(wires, layout);
-      if (failed(resB)) {
-        return failure();
+      t.success = true;
+    });
+
+    Trial* best = nullptr;
+    for (Trial& t : trials) {
+      if (t.success &&
+          (best == nullptr || best->stats.nswaps > t.stats.nswaps)) {
+        best = &t;
       }
-      nswaps = *resB;
     }
 
-    return nswaps;
+    if (best == nullptr) {
+      return failure();
+    }
+
+    return best->bundle.layout;
   }
 
-  /**
-   * @brief Perform A* search to find a sequence of SWAPs that makes the
-   * two-qubit operations inside the first layer (the front) executable.
-   * @details
-   * The iteration budget is b^{3} node expansions, i.e. roughly a depth-3
-   * search in a tree with branching factor b. A hard cap prevents impractical
-   * runtimes on larger architectures.
-   *
-   * The branching factor b of the A* search is the product of the
-   * architecture's maximum qubit degree and the maximum number of two-qubit
-   * gates in any layer:
-   *
-   * b = maxDegree × ⌈N/2⌉
-   *
-   * @returns a vector of hardware-index pairs (each denoting a SWAP) or
-   * failure() if A* fails.
-   */
-  [[nodiscard]] FailureOr<SmallVector<IndexPairType>>
-  search(const Window& window, const Layout& layout) {
+  /// Perform A* search to find a sequence of SWAPs that makes all two-qubit ops
+  /// inside the first layer executable.
+  ///
+  /// The iteration budget is b^{3} node expansions, i.e. roughly a depth-3
+  /// search in a tree with branching factor b, where b is the product of the
+  /// architecture's maximum qubit degree and the maximum number of two-qubit
+  /// gates in any layer: `b = maxDegree × ⌈N/2⌉`. A hard cap prevents
+  /// impractical runtimes on larger architectures.
+  ///
+  /// Returns `failure`, if the A* search fails.
+  FailureOr<SmallVector<IndexPairType>> search(const Window& window,
+                                               const Layout& layout) {
     constexpr size_t cap = 25'000'000UL;
 
-    const size_t b = device.maxDegree() * ((device.nqubits() + 1) / 2);
+    const size_t b = device->maxDegree() * ((device->nqubits() + 1) / 2);
     const size_t budget = std::min(b * b * b, cap);
 
     const Parameters params{.alpha = alpha, .lambda = lambda};
@@ -645,13 +621,13 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
 
     // Early exit, if the root node is a goal node already.
     Node* root = std::construct_at(arena.Allocate(), layout);
-    if (root->isGoal(window.front(), device)) {
+    if (root->isGoal(window.front(), *device)) {
       return SmallVector<IndexPairType>{};
     }
 
     frontier.emplace(root);
 
-    DenseMap<ArrayRef<IndexType>, size_t> bestDepth;
+    DenseMap<ArrayRef<size_t>, size_t> bestDepth;
     DenseSet<IndexPairType> expansionSet;
 
     size_t i = 0;
@@ -676,10 +652,10 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
         it->second = curr->depth;
       }
 
-      // If the currently visited node is a goal node, reconstruct the sequence
-      // of SWAPs from this node to the root.
+      // If the currently visited node is a goal node, reconstruct the
+      // sequence of SWAPs from this node to the root.
 
-      if (curr->isGoal(window.front(), device)) {
+      if (curr->isGoal(window.front(), *device)) {
         SmallVector<IndexPairType> seq(curr->depth);
         size_t j = seq.size() - 1;
         for (Node* n = curr; n->parent != nullptr; n = n->parent) {
@@ -697,7 +673,7 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
       const auto& [q0, q1] = window.front();
       for (const auto prog : {q0, q1}) {
         for (const auto hw0 = curr->layout.getHardwareIndex(prog);
-             const auto hw1 : device.neighboursOf(hw0)) {
+             const auto hw1 : device->neighboursOf(hw0)) {
           // Ensure consistent hashing/comparison.
           const IndexPairType swap = std::minmax(hw0, hw1);
           if (!expansionSet.insert(swap).second) {
@@ -705,95 +681,155 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
           }
 
           frontier.emplace(std::construct_at(arena.Allocate(), curr, swap,
-                                             window, device, params));
+                                             window, *device, params));
         }
       }
 
       ++i;
     }
 
+    // The search loop ended without reaching a goal node.
+
     return failure();
   }
 
-  /**
-   * @brief Skip a qubit-pair block.
-   * @details Traverses the pair of wire iterators in tandem until a two-qubit
-   * operation is found. If the two-qubit operation is equivalent, continue.
-   * Otherwise stop.
-   */
-  template <WireDirection Direction>
-  static void skipQubitPairBlock(WireIterator& w0, WireIterator& w1) {
-    using Traits = WireTraversalTraits<Direction>;
+  /// Return the sequence of SWAPs to move from one layout to another.
+  /// Implements the 4-Approximation algorithm described in arXiv:1602.05150v3.
+  SmallVector<IndexPairType> restore(const Layout& from, const Layout& to) {
+    static constexpr size_t MIN_CYCLE_LENGTH = 2;
 
-    WireIterator curr0(w0);
-    WireIterator curr1(w1);
-    while (true) {
-      while (Traits::isActive(curr0)) {
-        std::ranges::advance(curr0, Traits::stride());
-      }
+    Graph f;
+    Layout curr(from);
+    SmallVector<IndexPairType> swaps;
 
-      if (curr0 == std::default_sentinel) {
-        return;
-      }
+    const auto shouldAddEdge = [&](size_t u, size_t v) {
+      const auto prog = curr.getProgramIndex(u);
+      const auto hwGoal = to.getHardwareIndex(prog);
+      return device->distanceBetween(v, hwGoal) <
+             device->distanceBetween(u, hwGoal);
+    };
+
+    do {
+      f.clear();
 
-      while (Traits::isActive(curr1)) {
-        std::ranges::advance(curr1, Traits::stride());
+      // Build F-graph: Add edges to F for each edge in the coupling graph.
+      // Note that this assumes that the coupling graph is directed, but
+      // symmetric (essentially: undirected).
+      for (const auto u : device->qubits()) {
+        for (const auto v : device->neighboursOf(u)) {
+          if (shouldAddEdge(u, v)) {
+            f.addEdge(u, v);
+          }
+        }
       }
 
-      if (curr1 == std::default_sentinel) {
-        return;
+      // Try to find a directed cycle in the F graph. If there is one,
+      // we can apply a happy swap chain. Note that this happy swap chain
+      // does not include the final back edge closing the cycle because the
+      // first SWAP changes the token (the qubit) on the target, invalidating
+      // the edge in F.
+
+      if (const auto cycle = f.findCycle();
+          cycle && cycle->size() >= MIN_CYCLE_LENGTH) {
+        for (size_t i = 0; i + 1 < cycle->size(); ++i) {
+          curr.swap((*cycle)[i], (*cycle)[i + 1]);
+          swaps.emplace_back((*cycle)[i], (*cycle)[i + 1]);
+        }
+        continue;
       }
 
-      if (curr0.operation() != curr1.operation()) {
-        return;
+      for (const auto v : f.getNodes()) {
+        if (f.getDegree(v) == 0) {
+          if (const auto nbrs = device->neighboursOf(v); !nbrs.empty()) {
+            const auto u = nbrs.front();
+            curr.swap(u, v);
+            swaps.emplace_back(u, v);
+          }
+          break;
+        }
       }
+    } while (!f.empty());
+
+    return swaps;
+  }
 
-      // Handle two-qubit barrier edge case explicitly.
-      if (auto barrier = dyn_cast<BarrierOp>(curr0.operation())) {
-        if (barrier.getNumQubits() != 2) {
+  /// Skip to the end of the two-qubit block for both wire iterators, where
+  /// initially both must point at the same two-qubit operation.
+  template <WireDirection Direction>
+  static void skipQubitPairBlock(WireIterator& it0, WireIterator& it1) {
+    using Traits = WireTraversalTraits<Direction>;
+
+    // Traverse the pair of wire iterators in tandem until each reaches a
+    // multi-qubit operation. If both reached the same operation, continue.
+    // Otherwise stop.
+
+    std::array<WireIterator, 2> block{it0, it1};
+    while (true) {
+      for (auto& it : block) {
+        while (Traits::isActive(it)) {
+          std::ranges::advance(it, Traits::stride());
+
+          if (it.operation() == nullptr) { // isa<Blockargument>
+            return;
+          }
+
+          if (auto u = dyn_cast<UnitaryOpInterface>(it.operation());
+              u && u.getNumQubits() > 1) {
+            // Handle two-qubit barrier edge case explicitly.
+            if (isa<BarrierOp>(u) && u.getNumQubits() != 2) {
+              return;
+            }
+            // Otherwise stop for subsequent two-qubit unitary comparison.
+            break;
+          }
+        }
+
+        if (it == std::default_sentinel) {
           return;
         }
       }
 
-      w0 = curr0;
-      w1 = curr1;
+      if (block[0].operation() != block[1].operation()) {
+        return;
+      }
+
+      it0 = block[0];
+      it1 = block[1];
     }
   }
 
-  /**
-   * @brief Build and return window of layers.
-   * @details Traverses the circuit-layers until the desired window sizes is
-   * reached. Assumes that wires[i] = i-th program qubit. The size of the window
-   * is 1 + nlookahead.
-   * @returns window of layers.
-   */
+  /// Return a window of up to `1 + nlookahead` layers (`wires` is a copy).
   template <WireDirection Direction>
-  Window getWindow(ArrayRef<WireIterator> baseWires) {
+  Window getWindow(Wires wires, const WireInfos& infos) {
     Window window;
     window.reserve(1 + nlookahead);
 
-    SmallVector<WireIterator> wires(baseWires);
-    std::ignore = walkProgramGraph<Direction>(
+    walkProgramGraph<Direction>(
         wires, [&](const ReadyRange& ready, ReleasedOps& released) {
           if (ready.empty()) {
             return WalkResult::advance();
           }
 
-          for (const auto& [op, progs] : ready) {
-            if (isa<BarrierOp>(op)) {
-              released.emplace_back(op);
-              continue;
-            }
+          for (const auto& [op, indices] : ready) {
+            if (auto u = dyn_cast<UnitaryOpInterface>(op)) {
+              const auto i0 = indices[0];
+              const auto i1 = indices[1];
+              // Translate the wire indices into program qubit indices.
+              const auto prog0 = infos.lookupProgram(i0);
+              const auto prog1 = infos.lookupProgram(i1);
 
-            const auto p0 = progs[0];
-            const auto p1 = progs[1];
-            window.emplace_back(p0, p1);
-            if (window.size() == 1 + nlookahead) {
-              return WalkResult::interrupt();
+              window.emplace_back(prog0, prog1);
+              if (window.size() == 1 + nlookahead) {
+                return WalkResult::interrupt();
+              }
+              // NOTE(review): all loop paths return; only ready[0] is handled.
+              skipQubitPairBlock<Direction>(wires[i0], wires[i1]);
+              released.emplace_back(u);
+              return WalkResult::advance();
             }
 
-            skipQubitPairBlock<Direction>(wires[p0], wires[p1]);
-            released.emplace_back(wires[p0].operation());
+            released.emplace_back(op);
+            return WalkResult::advance();
           }
 
           return WalkResult::advance();
@@ -802,145 +838,243 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     return window;
   }
 
-  /**
-   * @brief Advance past all executable gates.
-   * @details Traverses the multi-qubit gates of the circuit until no more
-   * executable gates are found.
-   */
+  /// Apply `swaps` (pairs of hardware indices) to the bundle's layout and,
+  /// for `RoutingMode::Hot`, also insert SWAP ops into the IR via `rewriter`
+  /// (must be non-null). Each wire must point at its insertion point.
+  template <RoutingMode Mode>
+  static void insertSWAPs(ArrayRef<IndexPairType> swaps, RoutingBundle& bundle,
+                          Statistics& stats, IRRewriter* rewriter) {
+    auto& [wires, infos, layout] = bundle;
+    for (const auto& [hw0, hw1] : swaps) {
+      if constexpr (Mode == RoutingMode::Hot) {
+        const auto [prog0, prog1] = layout.getProgramIndices(hw0, hw1);
+
+        const auto i0 = infos.lookupIndex(prog0);
+        const auto i1 = infos.lookupIndex(prog1);
+
+        auto& w0 = wires[i0];
+        auto& w1 = wires[i1];
+
+        const auto in0 = w0.qubit();
+        const auto in1 = w1.qubit();
+        assert(rewriter != nullptr && "RoutingMode::Hot requires a rewriter");
+        rewriter->setInsertionPointAfterValue(in0); // Valid bc. Hot => Forward.
+        auto swapOp = SWAPOp::create(*rewriter, in0.getLoc(), in0, in1);
+
+        const auto out0 = swapOp.getQubit0Out();
+        const auto out1 = swapOp.getQubit1Out();
+        // Cross-wire the results: users of in0 now read out1 and vice versa.
+        rewriter->replaceAllUsesExcept(in0, out1, swapOp);
+        rewriter->replaceAllUsesExcept(in1, out0, swapOp);
+
+        infos.swap(prog0, prog1);
+
+        std::advance(w0, 1); // Move to SWAP.
+        std::advance(w1, 1);
+      }
+
+      layout.swap(hw0, hw1);
+    }
+
+    stats.nswaps += swaps.size();
+  }
+
+  /// Advance past all executable gates and return operations with nested
+  /// regions and the respective wire indices. Stops when no more executable
+  /// gates are found. After the function returns, the wires point at the
+  /// results of non-executable gates or operations with nested regions.
   template <WireDirection Direction>
-  void skipExecutableGates(MutableArrayRef<WireIterator> wires,
-                           Layout& layout) {
-    std::ignore = walkProgramGraph<Direction>(
-        wires, [&](const ReadyRange& ready, ReleasedOps& released) {
-          if (ready.empty()) {
-            return WalkResult::advance();
-          }
+  SmallVector<std::pair<Operation*, SmallVector<size_t>>>
+  advance(Wires& wires, const WireInfos& infos, const Layout& layout) {
+    SmallVector<std::pair<Operation*, SmallVector<size_t>>> stack;
+
+    // Advance wires past all executable gates and push operations with
+    // nested regions and the respective wire indices of their inputs onto the
+    // result stack.
+
+    walkProgramGraph<Direction>(wires, [&](const ReadyRange& ready,
+                                           ReleasedOps& released) {
+      if (ready.empty()) {
+        return WalkResult::advance();
+      }
 
-          for (const auto& [op, progs] : ready) {
-            if (isa<BarrierOp>(op)) {
-              released.emplace_back(op);
-              continue;
-            }
+      for (const auto& [readyOp, indices] : ready) {
+        TypeSwitch<Operation*>(readyOp)
+            .template Case<BarrierOp>(
+                [&](BarrierOp op) { released.emplace_back(op); })
+            .template Case<UnitaryOpInterface>([&](UnitaryOpInterface op) {
+              const auto prog0 = infos.lookupProgram(indices[0]);
+              const auto prog1 = infos.lookupProgram(indices[1]);
+              const auto [hw0, hw1] = layout.getHardwareIndices(prog0, prog1);
+              if (device->areAdjacent(hw0, hw1)) {
+                released.emplace_back(op);
+              }
+            })
+            .template Case<scf::ForOp>(
+                [&](scf::ForOp op) { stack.emplace_back(op, indices); });
+      }
+
+      if (released.empty()) {
+        return WalkResult::interrupt();
+      }
 
-            const auto [hw0, hw1] =
-                layout.getHardwareIndices(progs[0], progs[1]);
+      return WalkResult::advance();
+    });
+
+    return stack;
+  }
 
-            if (device.areAdjacent(hw0, hw1)) {
-              released.emplace_back(op);
+  /// Iterates over a dynamically computed window of layers and uses A* search
+  /// to find a SWAP sequence that makes each layer executable. Depending on
+  /// the template parameter, this function only updates the layout or also
+  /// inserts the SWAPs into the IR. The function returns `failure` if A* is
+  /// unable to find a solution.
+  template <WireDirection Direction, RoutingMode Mode = RoutingMode::Cold>
+    requires(Mode != RoutingMode::Hot || Direction == WireDirection::Forward)
+  LogicalResult route(RoutingBundle& bundle, Statistics& stats,
+                      IRRewriter* rewriter = nullptr) {
+    using Traits = WireTraversalTraits<Direction>;
+
+    auto& [wires, infos, layout] = bundle;
+
+    while (true) {
+
+      while (true) {
+        const auto stack = advance<Direction>(wires, infos, layout);
+
+        if (stack.empty()) {
+          break;
+        }
+
+        // Continue with processing the nested regions recursively.
+
+        for (const auto& [op, indices] : stack) {
+          assert(isa<scf::ForOp>(op));
+          auto forOp = cast<scf::ForOp>(op);
+
+          RoutingBundle child{.layout = layout};
+
+          // Map parent values (loop results) to child values. Going forwards,
+          // the recursive routing starts at the block arguments (iter args);
+          // going backwards, it starts at the yielded values.
+
+          for (size_t i : indices) {
+            const auto prog = infos.lookupProgram(i);
+            const auto res = cast<OpResult>(wires[i].qubit());
+            const auto arg = forOp.getTiedLoopRegionIterArg(res);
+            const auto index = child.wires.size();
+
+            if constexpr (Direction == WireDirection::Forward) {
+              child.wires.emplace_back(arg);
+              child.infos.map(index, prog);
+            } else {
+              const auto yield = forOp.getTiedLoopYieldedValue(arg)->get();
+              child.wires.emplace_back(yield);
+              child.infos.map(index, prog);
             }
           }
 
-          // Stop, if there are no more ready AND executable gates.
-          if (released.empty()) {
-            return WalkResult::interrupt();
+          const auto res = route<Direction, Mode>(child, stats, rewriter);
+          if (failed(res)) {
+            return failure();
           }
 
-          return WalkResult::advance();
-        });
-  }
+          const auto swaps = restore(child.layout, layout);
 
-  /**
-   * @brief Route via SWAP insertion.
-   * @details Iterates over a dynamically computed window of layers and uses A*
-   * search to find a sequence of SWAPs that makes that layer executable.
-   * Depending on the template parameter, this function only updates
-   * (and hence modifies) the layout or also inserts the SWAPs into the IR.
-   * @returns failure() if A* search isn't able to find a solution, the number
-   * of SWAPs otherwise.
-   */
-  template <WireDirection Direction, RoutingMode mode = RoutingMode::Cold>
-  FailureOr<size_t> route(SmallVector<WireIterator>& wires, Layout& layout,
-                          IRRewriter* rewriter = nullptr) {
-    using Traits = WireTraversalTraits<Direction>;
+          if constexpr (Mode == RoutingMode::Hot) {
 
-    size_t nswaps{0};
-    while (true) {
-      skipExecutableGates<Direction>(wires, layout);
+            // After routing the loop body, all iterators point to
+            // std::default_sentinel. To move the iterators to the correct
+            // qubit SSA values for the epilogue SWAPs, decrement each
+            // twice: (sentinel → yield → unitary/block arg).
 
-      const auto window = getWindow<Direction>(wires);
-      if (window.empty()) {
-        break;
-      }
+            llvm::for_each(child.wires, [](auto& it) { std::advance(it, -2); });
+          }
 
-      if constexpr (mode == RoutingMode::Hot) {
+          insertSWAPs<Mode>(swaps, child, stats, rewriter);
 
-        // At this point the wire iterators either point to
-        // std::default_sentinel or a multi-qubit gate (including barriers) of
-        // the current or subsequent layers. The former must be decremented
-        // twice (sentinel -> sink -> unitary/static). For the latter we simply
-        // must ensure the insertion point is before the multi-qubit gates.
+          if constexpr (Mode == RoutingMode::Hot) {
+            sortTopologically(forOp.getBody());
+          }
 
-        for (auto& it : wires) {
-          std::ranges::advance(it, it == std::default_sentinel
-                                       ? -2 * Traits::stride()
-                                       : -Traits::stride());
+          // Finally, move past the operation with nested regions by
+          // incrementing the respective global wires.
+
+          llvm::for_each(indices, [&](size_t i) {
+            std::advance(wires[i], Traits::stride());
+          });
         }
       }
 
+      const auto window = getWindow<Direction>(wires, infos);
+      if (window.empty()) {
+        break;
+      }
+
       const auto swaps = search(window, layout);
       if (failed(swaps)) {
         return failure();
       }
 
-      for (const auto& [hw0, hw1] : *swaps) {
-        if constexpr (mode == RoutingMode::Hot) {
-          const auto& [prog0, prog1] = layout.getProgramIndices(hw0, hw1);
-          const auto& w0 = wires[prog0];
-          const auto& w1 = wires[prog1];
-
-          assert(!isa<SinkOp>(w0.operation()));
-          assert(!isa<SinkOp>(w1.operation()));
-
-          const auto in0 = w0.qubit();
-          const auto in1 = w1.qubit();
-
-          rewriter->setInsertionPointAfter(in0.getDefiningOp());
-          auto swapOp = SWAPOp::create(*rewriter, in0.getLoc(), in0, in1);
-
-          const auto out0 = swapOp.getQubit0Out();
-          const auto out1 = swapOp.getQubit1Out();
-
-          rewriter->replaceAllUsesExcept(in0, out1, swapOp);
-          rewriter->replaceAllUsesExcept(in1, out0, swapOp);
+      if constexpr (Mode == RoutingMode::Hot) {
 
-          // Preserve program-indexed wire semantics.
-          wires[prog0] = WireIterator(out1);
-          wires[prog1] = WireIterator(out0);
+        // At this point the wire iterators either point to
+        // std::default_sentinel or a multi-qubit gate (incl. barriers) of
+        // the current or subsequent layers. The former must be decremented
+        // twice (sentinel → sink → unitary/static). For the latter, we
+        // must ensure the insertion point is before the multi-qubit gates.
 
-          assert(isa<SWAPOp>(w0.operation()));
-          assert(isa<SWAPOp>(w1.operation()));
+        for (auto& it : wires) {
+          std::advance(it, it == std::default_sentinel ? -2 : -1);
         }
-        layout.swap(hw0, hw1);
       }
 
-      if constexpr (mode == RoutingMode::Hot) {
+      insertSWAPs<Mode>(*swaps, bundle, stats, rewriter);
+
+      if constexpr (Mode == RoutingMode::Hot) {
 
         // After SWAP insertion, a wire is either untouched by the SWAP
-        // insertion or pointing at a SWAP operation. If the former is the case,
-        // incrementing the wire iterator will undo the previous decrement,
-        // leaving it at the same position as before the SWAP insertion.
-        // Otherwise, an increment will move the iterator to the multi-qubit op
-        // of the current or subsequent layer or to a sink (and thus
-        // std::default_sentinel).
-
-        for_each(wires,
-                 [](auto& it) { std::ranges::advance(it, Traits::stride()); });
+        // insertion or pointing at a SWAP operation. If the former is the
+        // case, incrementing the wire iterator will undo the previous
+        // decrement, leaving it at the same position as before the SWAP
+        // insertion. Otherwise, an increment will move the iterator to the
+        // multi-qubit op of the current or subsequent layer or to a sink (and
+        // thus std::default_sentinel).
+
+        llvm::for_each(wires, [](auto& it) { std::advance(it, 1); });
       }
-
-      nswaps += swaps->size();
     }
 
-    return nswaps;
+    return success();
   }
 
-  AugmentedDevice device;
+  std::shared_ptr<AugmentedDevice> device;
 };
 
 } // namespace
 
-std::unique_ptr<Pass> createMappingPass(size_t nqubits, const Edges& coupling,
-                                        MappingPassOptions options) {
-  return std::make_unique<MappingPass>(nqubits, coupling, options);
+std::unique_ptr<Pass>
+createMappingPass(const llvm::DenseSet<std::pair<size_t, size_t>>& couplingSet,
+                  MappingPassOptions options) {
+  // Validate the coupling set before constructing the pass: self-loops
+  // (u, u) are rejected, and the set must be symmetric, i.e., for every edge
+  // (u, v) in the set, (v, u) must also be present. Violations are reported
+  // via llvm::reportFatalUsageError, which does not return.
+
+  for (const auto& [u, v] : couplingSet) {
+    if (u == v) {
+      llvm::reportFatalUsageError("Found an invalid (u, u) edge.");
+    }
+
+    if (!couplingSet.contains(std::make_pair(v, u))) {
+      llvm::reportFatalUsageError("Expected symmetric coupling set: edge (" +
+                                  Twine(u) + ", " + Twine(v) +
+                                  ") exists but (" + Twine(v) + ", " +
+                                  Twine(u) + ") does not.");
+    }
+  }
+
+  return std::make_unique<MappingPass>(couplingSet, options);
 }
 
 } // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Utils/Algorithms.cpp b/mlir/lib/Dialect/QCO/Utils/Algorithms.cpp
deleted file mode 100644
index c8da5817f0..0000000000
--- a/mlir/lib/Dialect/QCO/Utils/Algorithms.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
- * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
- * All rights reserved.
- *
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License
- */
-
-#include "mlir/Dialect/QCO/Utils/Algorithms.h"
-
-#include <llvm/ADT/SmallVector.h>
-#include <mlir/Support/LLVM.h>
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-
-namespace mlir::qco {
-Matrix findAllShortestPaths(size_t n, const Edges& edges) {
-  Matrix dist(n, SmallVector<size_t>(n, UINT64_MAX));
-
-  for (const auto& [u, v] : edges) {
-    dist[u][v] = 1;
-  }
-  for (std::size_t v = 0; v < n; ++v) {
-    dist[v][v] = 0;
-  }
-
-  for (std::size_t k = 0; k < n; ++k) {
-    for (std::size_t i = 0; i < n; ++i) {
-      for (std::size_t j = 0; j < n; ++j) {
-        if (dist[i][k] == UINT64_MAX || dist[k][j] == UINT64_MAX) {
-          continue; // Avoid overflow with "infinite" distances.
-        }
-
-        const std::size_t sum = dist[i][k] + dist[k][j];
-        dist[i][j] = std::min(dist[i][j], sum);
-      }
-    }
-  }
-
-  return dist;
-}
-} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Utils/Graph.cpp b/mlir/lib/Dialect/QCO/Utils/Graph.cpp
new file mode 100644
index 0000000000..81913d67ea
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Utils/Graph.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Utils/Graph.h"
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/Support/LLVM.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <utility>
+
+namespace mlir::qco {
+void Graph::addEdge(size_t u, size_t v) {
+  adj_[u].emplace_back(v); // Directed edge: only u's list gains an entry.
+  adj_.try_emplace(v);     // Ensure v exists in the map (no-op if present).
+}
+
+ArrayRef<size_t> Graph::getNeighbours(size_t id) const { return adj_.at(id); }
+SmallVector<size_t> Graph::getNodes() const { return to_vector(adj_.keys()); }
+
+size_t Graph::getMaxDegree() const {
+  size_t deg = 0;
+  for (const auto& [u, nbrs] : adj_) {
+    deg = std::max(deg, nbrs.size());
+  }
+  return deg;
+}
+
+Graph::DistanceMatrix Graph::getDistMatrix() const {
+  const auto n = getNumNodes();
+
+  Graph::DistanceMatrix dist(n, UINT64_MAX);
+  for (const auto& [u, nbrs] : adj_) {
+    for (const auto& v : nbrs) {
+      dist[u][v] = 1;
+    }
+  }
+  for (size_t v = 0; v < n; ++v) {
+    dist[v][v] = 0;
+  }
+
+  for (size_t k = 0; k < n; ++k) {
+    for (size_t i = 0; i < n; ++i) {
+      for (size_t j = 0; j < n; ++j) {
+        if (dist[i][k] == UINT64_MAX || dist[k][j] == UINT64_MAX) {
+          continue; // Avoid overflow with "infinite" distances.
+        }
+
+        const size_t sum = dist[i][k] + dist[k][j];
+        dist[i][j] = std::min(dist[i][j], sum);
+      }
+    }
+  }
+
+  return dist;
+}
+
+/// Search for a cycle along the directed adjacency lists using an iterative
+/// depth-first search. Returns the node ids of the first cycle found, in
+/// traversal order, or std::nullopt if the graph has no cycle.
+std::optional<SmallVector<size_t>> Graph::findCycle() const {
+  enum struct State : uint8_t { Unseen, Seen, Finished };
+
+  struct Frame {
+    size_t id;
+    size_t neighbourIdx;
+  };
+
+  SmallVector<Frame> stack;
+  llvm::DenseMap<size_t, size_t> parents;
+  llvm::DenseMap<size_t, State> states;
+
+  // Preparation step: Mark all nodes as unseen.
+  llvm::for_each(adj_.keys(), [&](size_t id) { states[id] = State::Unseen; });
+
+  for (const auto initId : adj_.keys()) {
+    // Only start from unseen nodes.
+    if (states[initId] != State::Unseen) {
+      continue;
+    }
+
+    stack.emplace_back(initId, 0);
+
+    while (!stack.empty()) {
+      Frame& top = stack.back();
+
+      // If we haven't seen this node before, mark it as seen.
+      if (states[top.id] == State::Unseen) {
+        states[top.id] = State::Seen;
+      }
+
+      auto it = adj_.find(top.id);
+      assert(it != adj_.end() && "expected node id in adjacency map");
+      const auto& nbrs = it->getSecond(); // No copy: adj_ is not modified.
+
+      // Once all neighbours have been visited (the index is past the last
+      // neighbour), mark the node as finished and pop its frame.
+      if (top.neighbourIdx >= nbrs.size()) {
+        states[top.id] = State::Finished;
+        stack.pop_back();
+        continue;
+      }
+
+      // Fetch the next neighbour and advance the frame's index.
+      const auto nbrId = nbrs[top.neighbourIdx];
+      ++top.neighbourIdx;
+
+      if (states[nbrId] == State::Unseen) {
+        parents[nbrId] = top.id;
+        stack.emplace_back(nbrId, 0);
+      } else if (states[nbrId] == State::Seen) {
+        SmallVector<size_t> path;
+        for (auto curr = top.id; curr != nbrId; curr = parents[curr]) {
+          path.emplace_back(curr);
+        }
+        path.emplace_back(nbrId);
+        std::ranges::reverse(path);
+        return path;
+      }
+    }
+
+    stack.clear(); // Prepare the stack for the next start node.
+  }
+
+  return std::nullopt;
+}
+
+} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Utils/Layout.cpp b/mlir/lib/Dialect/QCO/Utils/Layout.cpp
new file mode 100644
index 0000000000..9f0a3b4209
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Utils/Layout.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Utils/Layout.h"
+
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/Support/LLVM.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <numeric>
+#include <random>
+
+namespace mlir::qco {
+Layout Layout::random(const size_t nqubits, const size_t seed) {
+  SmallVector<size_t> mapping(nqubits);
+  std::iota(mapping.begin(), mapping.end(), size_t{0});
+  std::ranges::shuffle(mapping, std::mt19937_64{seed});
+
+  Layout layout(nqubits);
+  for (const auto [prog, hw] : enumerate(mapping)) {
+    layout.add(prog, hw);
+  }
+
+  return layout;
+}
+
+void Layout::add(const size_t prog, const size_t hw) {
+  assert(prog < programToHardware_.size() && "program index out of bounds");
+  assert(hw < hardwareToProgram_.size() && "hardware index out of bounds");
+  programToHardware_[prog] = hw;
+  hardwareToProgram_[hw] = prog;
+}
+
+size_t Layout::getProgramIndex(const size_t hw) const {
+  assert(hw < hardwareToProgram_.size() && "hardware index out of bounds");
+  return hardwareToProgram_[hw];
+}
+
+size_t Layout::getHardwareIndex(const size_t prog) const {
+  assert(prog < programToHardware_.size() && "program index out of bounds");
+  return programToHardware_[prog];
+}
+
+void Layout::swap(const size_t hwA, const size_t hwB) {
+  assert(hwA < hardwareToProgram_.size() && "hardware index out of bounds");
+  assert(hwB < hardwareToProgram_.size() && "hardware index out of bounds");
+  const auto progA = hardwareToProgram_[hwA];
+  const auto progB = hardwareToProgram_[hwB];
+
+  std::swap(hardwareToProgram_[hwA], hardwareToProgram_[hwB]);
+  std::swap(programToHardware_[progA], programToHardware_[progB]);
+}
+
+size_t Layout::nqubits() const { return programToHardware_.size(); }
+
+ArrayRef<size_t> Layout::getProgramToHardware() const {
+  return programToHardware_;
+}
+
+} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Utils/Qubits.cpp b/mlir/lib/Dialect/QCO/Utils/Qubits.cpp
deleted file mode 100644
index c01187f625..0000000000
--- a/mlir/lib/Dialect/QCO/Utils/Qubits.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
- * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
- * All rights reserved.
- *
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License
- */
-
-#include "mlir/Dialect/QCO/Utils/Qubits.h"
-
-#include "mlir/Dialect/QCO/IR/QCODialect.h"
-
-#include <mlir/IR/Value.h>
-
-#include <cassert>
-#include <cstddef>
-#include <utility>
-
-namespace mlir::qco {
-void Qubits::add(TypedValue<QubitType> q) {
-  const auto index = programToValue_.size();
-  programToValue_.try_emplace(index, q);
-  valueToIndex_.try_emplace(q, std::make_pair(QubitLocation::Program, index));
-}
-
-void Qubits::add(TypedValue<QubitType> q, std::size_t hw) {
-  hardwareToValue_.try_emplace(hw, q);
-  valueToIndex_.try_emplace(q, std::make_pair(QubitLocation::Hardware, hw));
-}
-
-void Qubits::remap(TypedValue<QubitType> prev, TypedValue<QubitType> next) {
-  assert(valueToIndex_.contains(prev));
-  const auto& [location, index] = valueToIndex_.lookup(prev);
-
-  valueToIndex_.erase(prev);
-  valueToIndex_.try_emplace(next, std::make_pair(location, index));
-
-  if (location == QubitLocation::Program) {
-    programToValue_[index] = next;
-    return;
-  }
-
-  hardwareToValue_[index] = next;
-}
-
-void Qubits::remove(TypedValue<QubitType> q) {
-  assert(valueToIndex_.contains(q));
-  const auto& [location, index] = valueToIndex_.lookup(q);
-
-  valueToIndex_.erase(q);
-
-  if (location == QubitLocation::Program) {
-    programToValue_.erase(index);
-    return;
-  }
-
-  hardwareToValue_.erase(index);
-}
-
-TypedValue<QubitType> Qubits::getProgramQubit(std::size_t index) const {
-  assert(programToValue_.contains(index));
-  return programToValue_.lookup(index);
-}
-
-TypedValue<QubitType> Qubits::getHardwareQubit(std::size_t index) const {
-  assert(hardwareToValue_.contains(index));
-  return hardwareToValue_.lookup(index);
-}
-
-std::size_t Qubits::getIndex(TypedValue<QubitType> q) const {
-  assert(valueToIndex_.contains(q));
-  const auto& res = valueToIndex_.lookup(q);
-  return res.second;
-}
-} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QTensor/Utils/TensorIterator.cpp b/mlir/lib/Dialect/QTensor/Utils/TensorIterator.cpp
index 064cdbd624..6af8c5bff7 100644
--- a/mlir/lib/Dialect/QTensor/Utils/TensorIterator.cpp
+++ b/mlir/lib/Dialect/QTensor/Utils/TensorIterator.cpp
@@ -26,12 +26,8 @@
 
 namespace mlir::qtensor {
 TypedValue<RankedTensorType> TensorIterator::tensor() const {
-  if (op_ == nullptr) {
-    return tensor_;
-  }
-
   // The following operations don't have an OpResult.
-  if (isa<DeallocOp, scf::YieldOp, qco::YieldOp>(op_)) {
+  if (op_ != nullptr && isa<DeallocOp, scf::YieldOp, qco::YieldOp>(op_)) {
     return nullptr;
   }
 
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp b/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp
index f209d23716..3888966a70 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp
+++ b/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp
@@ -14,14 +14,15 @@
 #include "mlir/Dialect/QCO/IR/QCOOps.h"
 #include "mlir/Dialect/QCO/Transforms/Mapping/Mapping.h"
 #include "mlir/Dialect/QCO/Transforms/Passes.h"
-#include "mlir/Dialect/QCO/Utils/Algorithms.h"
-#include "mlir/Dialect/QCO/Utils/Drivers.h"
-#include "mlir/Dialect/QCO/Utils/Qubits.h"
 
 #include <gtest/gtest.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/TypeSwitch.h>
+#include <llvm/Support/Debug.h>
 #include <llvm/Support/LogicalResult.h>
 #include <mlir/Dialect/Arith/IR/Arith.h>
 #include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/SCF/IR/SCF.h>
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/DialectRegistry.h>
 #include <mlir/IR/Location.h>
@@ -29,7 +30,7 @@
 #include <mlir/IR/Value.h>
 #include <mlir/Pass/PassManager.h>
 #include <mlir/Support/LLVM.h>
-#include <mlir/Support/WalkResult.h>
+#include <mlir/Transforms/Passes.h>
 
 #include <cassert>
 #include <cstddef>
@@ -41,66 +42,134 @@
 using namespace mlir;
 using namespace mlir::qco;
 
-using DeviceSpec = std::pair<size_t, Edges>;
-
-/**
- * @returns llvm::success() if all two-qubit gates inside @p region
- * fulfill the given coupling constraints. llvm::failure(), otherwise.
- */
-static LogicalResult isExecutable(Region& region, const Edges& coupling) {
-  return walkProgram(region, [&](Operation* curr, const Qubits& qubits) {
-    if (auto op = dyn_cast<UnitaryOpInterface>(curr)) {
-      if (isa<BarrierOp>(op)) {
-        return WalkResult::advance();
-      }
-
-      assert(op.getNumQubits() <= 2 &&
-             "isExecutable: expected two-qubit gate decomposition");
-
-      if (op.getNumQubits() > 1) {
-        const auto q0 = cast<TypedValue<QubitType>>(op.getInputQubit(0));
-        const auto q1 = cast<TypedValue<QubitType>>(op.getInputQubit(1));
-        const auto i0 = qubits.getIndex(q0);
-        const auto i1 = qubits.getIndex(q1);
-
-        if (!coupling.contains(std::make_pair(i0, i1))) {
-          return WalkResult::interrupt();
-        }
-      }
+namespace {
+struct Device {
+  size_t nqubits{};
+  DenseSet<std::pair<size_t, size_t>> couplingSet;
+};
+} // namespace
+
+/// Return true, if the operations within a region fulfill the given coupling
+/// constraints.
+static bool
+isExecutable(Region& body, DenseMap<Value, size_t>& m,
+             const DenseSet<std::pair<size_t, size_t>>& couplingSet) {
+  for (Operation& rop : body.getOps()) {
+    bool executable = true;
+    TypeSwitch<Operation*>(&rop)
+        .Case<StaticOp>(
+            [&](StaticOp op) { m.try_emplace(op.getQubit(), op.getIndex()); })
+        .Case<BarrierOp>([&](BarrierOp op) {
+          for (const auto [pred, succ] :
+               llvm::zip_equal(op.getInputQubits(), op.getOutputQubits())) {
+            const auto hw = m.at(pred);
+            m.try_emplace(succ, hw);
+          }
+        })
+        .Case<UnitaryOpInterface>([&](UnitaryOpInterface& op) {
+          assert(op.getNumQubits() <= 2 && "expected two-qubit decomp.");
+
+          if (op.getNumQubits() > 1) {
+            const auto hwA = m.at(op.getInputQubit(0));
+            const auto hwB = m.at(op.getInputQubit(1));
+            if (!couplingSet.contains(std::make_pair(hwA, hwB))) {
+              llvm::dbgs() << "(" << hwA << ", " << hwB << ") "
+                           << "not executable: \n";
+              op->dump();
+              executable = false;
+            }
+          }
+
+          for (const auto [pred, succ] :
+               llvm::zip_equal(op.getInputQubits(), op.getOutputQubits())) {
+            const auto hw = m.at(pred);
+            m.try_emplace(succ, hw);
+          }
+        })
+        .Case<scf::ForOp>([&](scf::ForOp op) {
+          DenseMap<Value, size_t> loopM;
+          for (const auto [init, arg] :
+               llvm::zip_equal(op.getInits(), op.getRegionIterArgs())) {
+            const auto hw = m.at(init);
+            loopM.try_emplace(arg, hw);
+          }
+
+          for (OpOperand& operand : op.getInitsMutable()) {
+            const auto pred = operand.get();
+            const auto succ = op.getTiedLoopResult(&operand);
+            const auto hw = m.at(pred);
+            m.try_emplace(succ, hw);
+          }
+
+          if (!isExecutable(op.getRegion(), loopM, couplingSet)) {
+            executable = false;
+            return;
+          }
+
+          for (const auto& [arg, yielded] :
+               llvm::zip_equal(op.getRegionIterArgs(), op.getYieldedValues())) {
+            if (loopM.at(arg) != loopM.at(yielded)) {
+              llvm::dbgs() << "for loop layout not restored!\n";
+              executable = false;
+              return;
+            }
+          }
+        })
+        .Case<scf::YieldOp>([&](scf::YieldOp op) {
+          assert(isa<scf::ForOp>(op->getParentOp()));
+          auto forOp = cast<scf::ForOp>(op->getParentOp());
+        })
+        .Case<ResetOp, MeasureOp>([&](auto op) {
+          const auto pred = op.getQubitIn();
+          const auto succ = op.getQubitOut();
+          const auto hw = m.at(pred);
+          m.try_emplace(succ, hw);
+        });
+
+    if (!executable) {
+      return false;
     }
+  }
 
-    return WalkResult::advance();
-  });
+  return true;
 }
 
-/**
- * @returns a 9x9 square-grid device.
- */
-static DeviceSpec getNineQubitSquareGrid() {
-  const static Edges COUPLING{{0, 3}, {3, 0}, {0, 1}, {1, 0}, {1, 4}, {4, 1},
-                              {1, 2}, {2, 1}, {2, 5}, {5, 2}, {3, 6}, {6, 3},
-                              {3, 4}, {4, 3}, {4, 7}, {7, 4}, {4, 5}, {5, 4},
-                              {5, 8}, {8, 5}, {6, 7}, {7, 6}, {7, 8}, {8, 7}};
-  return std::make_pair(9, COUPLING);
+/// Return true, if the entry point fulfills the given coupling constraints.
+static bool
+isExecutable(func::FuncOp entry,
+             const DenseSet<std::pair<size_t, size_t>>& couplingSet) {
+  DenseMap<Value, size_t> m;
+  return isExecutable(entry.getFunctionBody(), m, couplingSet);
+}
+
+/// Return a nine-qubit (3x3) square-grid device.
+static Device getNineQubitSquareGrid() {
+  return {.nqubits = 9,
+          .couplingSet = {{0, 3}, {3, 0}, {0, 1}, {1, 0}, {1, 4}, {4, 1},
+                          {1, 2}, {2, 1}, {2, 5}, {5, 2}, {3, 6}, {6, 3},
+                          {3, 4}, {4, 3}, {4, 7}, {7, 4}, {4, 5}, {5, 4},
+                          {5, 8}, {8, 5}, {6, 7}, {7, 6}, {7, 8}, {8, 7}}};
 }
 
 namespace {
 
 class MappingPassTest : public testing::Test,
-                        public testing::WithParamInterface<DeviceSpec> {
+                        public testing::WithParamInterface<Device> {
 protected:
   void SetUp() override {
     DialectRegistry registry;
-    registry.insert<QCODialect, arith::ArithDialect, func::FuncDialect>();
+    registry.insert<QCODialect, scf::SCFDialect, arith::ArithDialect,
+                    func::FuncDialect>();
     context = std::make_unique<MLIRContext>();
     context->appendDialectRegistry(registry);
     context->loadAllAvailableDialects();
   }
 
-  static LogicalResult runPass(ModuleOp m, const DeviceSpec& device,
-                               const MappingPassOptions& options) {
+  static LogicalResult
+  runPass(ModuleOp m, const DenseSet<std::pair<size_t, size_t>>& couplingSet,
+          const MappingPassOptions& options) {
     PassManager pm(m->getContext());
-    pm.addPass(createMappingPass(device.first, device.second, options));
+    pm.addPass(createMappingPass(couplingSet, options));
     return pm.run(m);
   }
 
@@ -113,9 +182,7 @@ TEST_P(MappingPassTest, NoEntryPoint) {
   const auto& device = GetParam();
 
   OwningOpRef m = ModuleOp::create(UnknownLoc::get(context.get()));
-
-  auto res = runPass(m.get(), device, MappingPassOptions{});
-
+  auto res = runPass(m.get(), device.couplingSet, MappingPassOptions{});
   ASSERT_TRUE(res.failed());
 }
 
@@ -130,7 +197,7 @@ TEST_P(MappingPassTest, NoQubitAllocations) {
   builder.sink(q0);
 
   auto m = builder.finalize();
-  auto res = runPass(m.get(), device, MappingPassOptions{});
+  auto res = runPass(m.get(), device.couplingSet, MappingPassOptions{});
 
   ASSERT_TRUE(res.failed());
 }
@@ -155,35 +222,35 @@ TEST_P(MappingPassTest, NoExtractAfterInsert) {
   builder.qtensorDealloc(tensor0);
 
   auto m = builder.finalize();
-  auto res = runPass(m.get(), device, MappingPassOptions{});
+  auto res = runPass(m.get(), device.couplingSet, MappingPassOptions{});
 
   ASSERT_TRUE(res.failed());
 }
 
 TEST_P(MappingPassTest, TooManyQubitsForArch) {
   const auto& device = GetParam();
+  const auto n = static_cast<int64_t>(device.nqubits) + 1;
 
   QCOProgramBuilder builder(context.get());
   builder.initialize();
 
-  int64_t nqubits = static_cast<int64_t>(device.first) + 1;
-  Value tensor = builder.qtensorAlloc(nqubits);
-  SmallVector<Value> qubits(nqubits);
-  for (int64_t i = 0; i < nqubits; ++i) {
+  Value tensor = builder.qtensorAlloc(n);
+  SmallVector<Value> qubits(n);
+  for (int64_t i = 0; i < n; ++i) {
     Value qi;
     std::tie(tensor, qi) = builder.qtensorExtract(tensor, i);
     qi = builder.h(qi);
     qubits[i] = qi;
   }
 
-  for (int64_t i = 0; i < nqubits; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     tensor = builder.qtensorInsert(qubits[i], tensor, i);
   }
 
   builder.qtensorDealloc(tensor);
 
   auto m = builder.finalize();
-  auto res = runPass(m.get(), device, MappingPassOptions{});
+  auto res = runPass(m.get(), device.couplingSet, MappingPassOptions{});
 
   ASSERT_TRUE(res.failed());
 }
@@ -215,11 +282,222 @@ TEST_P(MappingPassTest, GHZ) {
   builder.qtensorDealloc(tensor);
 
   auto m = builder.finalize();
-  auto res = runPass(m.get(), device, MappingPassOptions{});
+  auto res = runPass(m.get(), device.couplingSet, MappingPassOptions{});
+  auto entry = getEntryPoint(m.get());
+
+  ASSERT_TRUE(res.succeeded());
+  EXPECT_TRUE(isExecutable(entry, device.couplingSet));
+}
+
+TEST_P(MappingPassTest, GHZUnrolled) {
+  const auto& device = GetParam();
+  const auto n = static_cast<int64_t>(device.nqubits);
+
+  PassManager pm(context.get());
+  pm.addNestedPass<func::FuncOp>(createQuantumLoopUnroll());
+  pm.addPass(createCSEPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createMappingPass(device.couplingSet, MappingPassOptions{}));
+
+  QCOProgramBuilder builder(context.get());
+  builder.initialize();
+
+  Value tensor = builder.qtensorAlloc(n);
+  Value q0;
+  std::tie(tensor, q0) = builder.qtensorExtract(tensor, 0);
+  q0 = builder.h(q0);
+  tensor = builder.qtensorInsert(q0, tensor, 0);
+  tensor = builder.scfFor(
+      1, n, 1, {tensor}, [&builder](Value iv, ValueRange iterArgs) {
+        Value loopTensor = iterArgs[0];
+        Value ctrl;
+        Value targ;
+
+        std::tie(loopTensor, ctrl) = builder.qtensorExtract(loopTensor, 0);
+        std::tie(loopTensor, targ) = builder.qtensorExtract(loopTensor, iv);
+
+        std::tie(ctrl, targ) = builder.cx(ctrl, targ);
+
+        loopTensor = builder.qtensorInsert(ctrl, loopTensor, 0);
+        loopTensor = builder.qtensorInsert(targ, loopTensor, iv);
+
+        return SmallVector{loopTensor};
+      })[0];
+  builder.qtensorDealloc(tensor);
+
+  auto m = builder.finalize();
+  auto res = pm.run(m.get());
+  auto entry = getEntryPoint(m.get());
+
+  ASSERT_TRUE(res.succeeded());
+  EXPECT_TRUE(isExecutable(entry, device.couplingSet));
+}
+
+TEST_P(MappingPassTest, GroverLike) {
+  const auto& device = GetParam();
+
+  PassManager pm(context.get());
+  pm.addPass(createMappingPass(device.couplingSet, MappingPassOptions{}));
+
+  QCOProgramBuilder builder(context.get());
+  builder.initialize();
+
+  Value tensor = builder.qtensorAlloc(4);
+  Value flagTensor = builder.qtensorAlloc(1);
+  Value q0;
+  Value q1;
+  Value q2;
+  Value q3;
+  Value flag;
+
+  std::tie(tensor, q0) = builder.qtensorExtract(tensor, 0);
+  std::tie(tensor, q1) = builder.qtensorExtract(tensor, 1);
+  std::tie(tensor, q2) = builder.qtensorExtract(tensor, 2);
+  std::tie(tensor, q3) = builder.qtensorExtract(tensor, 3);
+  std::tie(flagTensor, flag) = builder.qtensorExtract(flagTensor, 0);
+
+  q0 = builder.h(q0);
+  q1 = builder.h(q1);
+  q2 = builder.h(q2);
+  q3 = builder.h(q3);
+  flag = builder.x(flag);
+
+  const auto forResults = builder.scfFor(
+      1, 3, 1, {q0, q1, q2, q3, flag}, [&builder](Value, ValueRange iterArgs) {
+        Value iterQ0 = iterArgs[0];
+        Value iterQ1 = iterArgs[1];
+        Value iterQ2 = iterArgs[2];
+        Value iterQ3 = iterArgs[3];
+        Value iterFlag = iterArgs[4];
+
+        std::tie(iterQ0, iterQ2) = builder.cx(iterQ0, iterQ2);
+        std::tie(iterQ2, iterQ3) = builder.cx(iterQ2, iterQ3);
+        std::tie(iterQ3, iterQ0) = builder.cx(iterQ3, iterQ0);
+        std::tie(iterQ0, iterFlag) = builder.cx(iterQ0, iterFlag);
+
+        return SmallVector{iterQ0, iterQ1, iterQ2, iterQ3, iterFlag};
+      });
+
+  q0 = forResults[0];
+  q1 = forResults[1];
+  q2 = forResults[2];
+  q3 = forResults[3];
+  flag = forResults[4];
+
+  const auto barrierResults = builder.barrier({q0, q1, q2, q3, flag});
+  q0 = barrierResults[0];
+  q1 = barrierResults[1];
+  q2 = barrierResults[2];
+  q3 = barrierResults[3];
+  flag = barrierResults[4];
+
+  Value c0;
+  Value c1;
+  Value c2;
+  Value c3;
+  Value c4;
+
+  std::tie(q0, c0) = builder.measure(q0);
+  std::tie(q1, c1) = builder.measure(q1);
+  std::tie(q2, c2) = builder.measure(q2);
+  std::tie(q3, c3) = builder.measure(q3);
+  std::tie(flag, c4) = builder.measure(flag);
+
+  tensor = builder.qtensorInsert(q0, tensor, 0);
+  tensor = builder.qtensorInsert(q1, tensor, 1);
+  tensor = builder.qtensorInsert(q2, tensor, 2);
+  tensor = builder.qtensorInsert(q3, tensor, 3);
+  flagTensor = builder.qtensorInsert(flag, flagTensor, 0);
+
+  builder.qtensorDealloc(tensor);
+  builder.qtensorDealloc(flagTensor);
+
+  auto m = builder.finalize();
+  auto res = pm.run(m.get());
+  auto entry = getEntryPoint(m.get());
+
+  ASSERT_TRUE(res.succeeded());
+  EXPECT_TRUE(isExecutable(entry, device.couplingSet));
+}
+
+TEST_P(MappingPassTest, ParallelLoops) {
+  constexpr int64_t nqubits = 6;
+  const auto& device = GetParam();
+
+  PassManager pm(context.get());
+  pm.addPass(createMappingPass(device.couplingSet, MappingPassOptions{}));
+
+  QCOProgramBuilder builder(context.get());
+  builder.initialize();
+
+  Value tensor = builder.qtensorAlloc(nqubits);
+  SmallVector<Value> creg(nqubits);
+  SmallVector<Value> qreg(nqubits);
+
+  for (int64_t i = 0; i < nqubits; ++i) {
+    std::tie(tensor, qreg[i]) = builder.qtensorExtract(tensor, i);
+    qreg[i] = builder.h(qreg[i]);
+  }
+
+  const auto upForResults =
+      builder.scfFor(1, 3, 1, {qreg[0], qreg[1], qreg[2]},
+                     [&builder](Value, ValueRange iterArgs) {
+                       Value iterQ0 = iterArgs[0];
+                       Value iterQ1 = iterArgs[1];
+                       Value iterQ2 = iterArgs[2];
+
+                       std::tie(iterQ0, iterQ1) = builder.cx(iterQ0, iterQ1);
+                       iterQ0 = builder.h(iterQ0);
+                       std::tie(iterQ0, iterQ1) = builder.cz(iterQ0, iterQ1);
+                       std::tie(iterQ1, iterQ2) = builder.cz(iterQ1, iterQ2);
+                       std::tie(iterQ0, iterQ2) = builder.cx(iterQ0, iterQ2);
+
+                       return SmallVector{iterQ0, iterQ1, iterQ2};
+                     });
+
+  qreg[0] = upForResults[0];
+  qreg[1] = upForResults[1];
+  qreg[2] = upForResults[2];
+
+  const auto downForResults =
+      builder.scfFor(1, 3, 1, {qreg[3], qreg[4], qreg[5]},
+                     [&builder](Value, ValueRange iterArgs) {
+                       Value iterQ0 = iterArgs[0];
+                       Value iterQ1 = iterArgs[1];
+                       Value iterQ2 = iterArgs[2];
+
+                       std::tie(iterQ0, iterQ1) = builder.cx(iterQ0, iterQ1);
+                       iterQ0 = builder.h(iterQ0);
+                       std::tie(iterQ1, iterQ2) = builder.cz(iterQ1, iterQ2);
+                       std::tie(iterQ0, iterQ1) = builder.cz(iterQ0, iterQ1);
+                       std::tie(iterQ0, iterQ2) = builder.cx(iterQ0, iterQ2);
+
+                       return SmallVector{iterQ0, iterQ1, iterQ2};
+                     });
+
+  qreg[3] = downForResults[0];
+  qreg[4] = downForResults[1];
+  qreg[5] = downForResults[2];
+
+  qreg = builder.barrier(qreg);
+
+  for (int64_t i = 0; i < nqubits; ++i) {
+    std::tie(qreg[i], creg[i]) = builder.measure(qreg[i]);
+    qreg[i] = builder.h(qreg[i]);
+  }
+
+  for (int64_t i = 0; i < nqubits; ++i) {
+    tensor = builder.qtensorInsert(qreg[i], tensor, i);
+  }
+
+  builder.qtensorDealloc(tensor);
+
+  auto m = builder.finalize();
+  auto res = pm.run(m.get());
   auto entry = getEntryPoint(m.get());
 
   ASSERT_TRUE(res.succeeded());
-  EXPECT_TRUE(isExecutable(entry.getFunctionBody(), device.second).succeeded());
+  EXPECT_TRUE(isExecutable(entry, device.couplingSet));
 }
 
 TEST_P(MappingPassTest, Sabre) {
@@ -309,11 +587,11 @@ TEST_P(MappingPassTest, Sabre) {
   builder.qtensorDealloc(tensorDown);
 
   auto m = builder.finalize();
-  auto res = runPass(m.get(), device, MappingPassOptions{});
+  auto res = runPass(m.get(), device.couplingSet, MappingPassOptions{});
   auto entry = getEntryPoint(m.get());
 
   ASSERT_TRUE(res.succeeded());
-  EXPECT_TRUE(isExecutable(entry.getFunctionBody(), device.second).succeeded());
+  EXPECT_TRUE(isExecutable(entry, device.couplingSet));
 }
 
 INSTANTIATE_TEST_SUITE_P(NineQubitSquareGrid, MappingPassTest,
diff --git a/mlir/unittests/Dialect/QCO/Utils/test_drivers.cpp b/mlir/unittests/Dialect/QCO/Utils/test_drivers.cpp
index c9f84fefd0..7bbc2437dd 100644
--- a/mlir/unittests/Dialect/QCO/Utils/test_drivers.cpp
+++ b/mlir/unittests/Dialect/QCO/Utils/test_drivers.cpp
@@ -12,7 +12,6 @@
 #include "mlir/Dialect/QCO/IR/QCODialect.h"
 #include "mlir/Dialect/QCO/IR/QCOOps.h"
 #include "mlir/Dialect/QCO/Utils/Drivers.h"
-#include "mlir/Dialect/QCO/Utils/Qubits.h"
 #include "mlir/Dialect/QCO/Utils/WireIterator.h"
 
 #include <gtest/gtest.h>
@@ -50,58 +49,6 @@ class DriversTest : public testing::Test {
 };
 } // namespace
 
-TEST_F(DriversTest, ProgramWalk) {
-  qco::QCOProgramBuilder builder(context.get());
-  builder.initialize();
-  const auto q00 = builder.allocQubit();
-  const auto q10 = builder.allocQubit();
-  const auto q20 = builder.allocQubit();
-  const auto q30 = builder.allocQubit();
-
-  const auto q01 = builder.h(q00);
-  const auto [q02, q11] = builder.cx(q01, q10);
-  const auto [q21, q31] = builder.cx(q20, q30);
-
-  const auto [q03, c0] = builder.measure(q02);
-  const auto [q12, c1] = builder.measure(q11);
-  const auto [q22, c2] = builder.measure(q21);
-  const auto [q32, c3] = builder.measure(q31);
-
-  builder.sink(q03);
-  builder.sink(q12);
-  builder.sink(q22);
-  builder.sink(q32);
-
-  auto mod = builder.finalize();
-  auto func = *(mod->getOps<func::FuncOp>().begin());
-
-  Value ex0 = nullptr;
-  Value ex1 = nullptr;
-  Value ex2 = nullptr;
-  Value ex3 = nullptr;
-
-  // Walk until the first measurement operation is encountered and stop.
-  // Since WalkOrder::PreOrder is used here, the state of the qubits is not yet
-  // updated with the SSA values of the measurement op.
-  // Consequently, the program qubits point at the outputs of the controlled-Xs.
-  std::ignore = qco::walkProgram(func.getBody(),
-                                 [&](Operation* op, const qco::Qubits& qubits) {
-                                   if (op == q03.getDefiningOp()) {
-                                     ex0 = qubits.getProgramQubit(0);
-                                     ex1 = qubits.getProgramQubit(1);
-                                     ex2 = qubits.getProgramQubit(2);
-                                     ex3 = qubits.getProgramQubit(3);
-                                     return WalkResult::interrupt();
-                                   }
-                                   return WalkResult::advance();
-                                 });
-
-  ASSERT_EQ(ex0, q02);
-  ASSERT_EQ(ex1, q11);
-  ASSERT_EQ(ex2, q21);
-  ASSERT_EQ(ex3, q31);
-}
-
 TEST_F(DriversTest, ProgramGraphWalkTooFewWires) {
   qco::QCOProgramBuilder builder(context.get());
   builder.initialize();