diff --git a/cpp/demo/biharmonic/main.cpp b/cpp/demo/biharmonic/main.cpp index 77277910836..88644089380 100644 --- a/cpp/demo/biharmonic/main.cpp +++ b/cpp/demo/biharmonic/main.cpp @@ -157,7 +157,7 @@ int main(int argc, char* argv[]) PetscInitialize(&argc, &argv, nullptr, nullptr); { // Create mesh - auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2); auto mesh = std::make_shared>( mesh::create_rectangle(MPI_COMM_WORLD, {{{0.0, 0.0}, {1.0, 1.0}}}, {32, 32}, mesh::CellType::triangle, part)); diff --git a/cpp/demo/codim_0_assembly/main.cpp b/cpp/demo/codim_0_assembly/main.cpp index 5be7086e5e2..caeb81afbf0 100644 --- a/cpp/demo/codim_0_assembly/main.cpp +++ b/cpp/demo/codim_0_assembly/main.cpp @@ -35,7 +35,7 @@ int main(int argc, char* argv[]) auto mesh = std::make_shared>(mesh::create_rectangle( MPI_COMM_WORLD, {{{0.0, 0.0}, {2.0, 1.0}}}, {1, 4}, mesh::CellType::quadrilateral, - mesh::create_cell_partitioner(mesh::GhostMode::shared_facet))); + mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2))); basix::FiniteElement element = basix::create_element( basix::element::family::P, basix::cell::type::quadrilateral, 1, diff --git a/cpp/demo/hyperelasticity/main.cpp b/cpp/demo/hyperelasticity/main.cpp index df95ae8b51b..2978e7dc58c 100644 --- a/cpp/demo/hyperelasticity/main.cpp +++ b/cpp/demo/hyperelasticity/main.cpp @@ -159,7 +159,7 @@ int main(int argc, char* argv[]) auto mesh = std::make_shared>(mesh::create_box( MPI_COMM_WORLD, {{{0.0, 0.0, 0.0}, {1.0, 1.0, 1.0}}}, {10, 10, 10}, mesh::CellType::tetrahedron, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); auto element = basix::create_element( basix::element::family::P, basix::cell::type::tetrahedron, 1, diff --git a/cpp/demo/interpolation-io/main.cpp b/cpp/demo/interpolation-io/main.cpp index 2fe2625e0c4..50666a56c7e 100644 --- 
a/cpp/demo/interpolation-io/main.cpp +++ b/cpp/demo/interpolation-io/main.cpp @@ -209,7 +209,7 @@ int main(int argc, char* argv[]) = std::make_shared>(mesh::create_rectangle( MPI_COMM_WORLD, {{{0.0, 0.0}, {1.0, 1.0}}}, {32, 4}, mesh::CellType::triangle, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); // Create mesh using same topology as mesh0, but with different // scalar type for geometry @@ -217,7 +217,7 @@ int main(int argc, char* argv[]) = std::make_shared>(mesh::create_rectangle( MPI_COMM_WORLD, {{{0.0, 0.0}, {1.0, 1.0}}}, {32, 4}, mesh::CellType::triangle, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); // Interpolate a function in a scalar Lagrange space and output the // result to file for visualisation using different types diff --git a/cpp/demo/poisson/main.cpp b/cpp/demo/poisson/main.cpp index 2684403557f..e33a3aba712 100644 --- a/cpp/demo/poisson/main.cpp +++ b/cpp/demo/poisson/main.cpp @@ -111,7 +111,7 @@ int main(int argc, char* argv[]) { // Create mesh and function space - auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2); auto mesh = std::make_shared>( mesh::create_rectangle(MPI_COMM_WORLD, {{{0.0, 0.0}, {2.0, 1.0}}}, {32, 16}, mesh::CellType::triangle, part)); diff --git a/cpp/demo/poisson_matrix_free/main.cpp b/cpp/demo/poisson_matrix_free/main.cpp index 7e199f32c00..2181d956bcd 100644 --- a/cpp/demo/poisson_matrix_free/main.cpp +++ b/cpp/demo/poisson_matrix_free/main.cpp @@ -138,7 +138,7 @@ void solver(MPI_Comm comm) // Create mesh and function space auto mesh = std::make_shared>(mesh::create_rectangle( comm, {{{0.0, 0.0}, {1.0, 1.0}}}, {10, 10}, mesh::CellType::triangle, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); auto element = 
basix::create_element( basix::element::family::P, basix::cell::type::triangle, 2, basix::element::lagrange_variant::unset, diff --git a/cpp/dolfinx/common/sort.h b/cpp/dolfinx/common/sort.h index 3dd59aeb9d0..b2faf137be9 100644 --- a/cpp/dolfinx/common/sort.h +++ b/cpp/dolfinx/common/sort.h @@ -174,17 +174,17 @@ constexpr void radix_sort(R&& range, P proj = {}) /// @note This function is suitable for small values of `shape1`. Each /// column of `x` is copied into an array that is then sorted. template -std::vector sort_by_perm(std::span x, std::size_t shape1) +std::vector sort_by_perm(std::span x, std::size_t shape1) { static_assert(std::is_integral_v, "Integral required."); if (x.empty()) - return std::vector{}; + return std::vector{}; assert(shape1 > 0); assert(x.size() % shape1 == 0); const std::size_t shape0 = x.size() / shape1; - std::vector perm(shape0); + std::vector perm(shape0); std::iota(perm.begin(), perm.end(), 0); // Sort by each column, right to left. Col 0 has the most significant diff --git a/cpp/dolfinx/io/VTKHDF.h b/cpp/dolfinx/io/VTKHDF.h index 66c35d152e2..1a7730942ce 100644 --- a/cpp/dolfinx/io/VTKHDF.h +++ b/cpp/dolfinx/io/VTKHDF.h @@ -468,11 +468,11 @@ mesh::Mesh read_mesh(MPI_Comm comm, const std::filesystem::path& filename, auto part = create_cell_partitioner(mesh::GhostMode::none, dolfinx::graph::partition_graph, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); std::vector> cells_span(cells_local.begin(), cells_local.end()); return mesh::create_mesh(comm, comm, cells_span, coordinate_elements, comm, points_pruned, {(std::size_t)x_shape[0], gdim}, part, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); } } // namespace dolfinx::io::VTKHDF diff --git a/cpp/dolfinx/mesh/Topology.cpp b/cpp/dolfinx/mesh/Topology.cpp index f6f0a31624d..0c62e757c79 100644 --- a/cpp/dolfinx/mesh/Topology.cpp +++ b/cpp/dolfinx/mesh/Topology.cpp @@ -10,6 +10,8 @@ #include "topologycomputation.h" #include "utils.h" #include +#include +#include 
#include #include #include @@ -19,6 +21,7 @@ #include #include #include +#include using namespace dolfinx; using namespace dolfinx::mesh; @@ -40,7 +43,8 @@ namespace /// @return Map from global index to sharing ranks for each index in /// indices. The owner rank is the first as the first in the of ranks. graph::AdjacencyList -determine_sharing_ranks(MPI_Comm comm, std::span indices) +determine_sharing_ranks(MPI_Comm comm, std::span indices, + int num_threads) { common::Timer timer("Topology: determine shared index ownership"); @@ -63,7 +67,13 @@ determine_sharing_ranks(MPI_Comm comm, std::span indices) int dest = dolfinx::MPI::index_owner(size, idx, global_range); dest_to_index.push_back({dest, static_cast(dest_to_index.size())}); } - std::ranges::sort(dest_to_index); + if (num_threads > 0) + { + boost::sort::block_indirect_sort(dest_to_index.begin(), + dest_to_index.end(), num_threads); + } + else + std::ranges::sort(dest_to_index); } // Build list of neighbour dest ranks and count number of indices to @@ -135,10 +145,13 @@ determine_sharing_ranks(MPI_Comm comm, std::span indices) // Build {global index, pos, src} list std::vector> indices_list; - for (std::size_t p = 0; p < recv_disp0.size() - 1; ++p) - for (std::int32_t i = recv_disp0[p]; i < recv_disp0[p + 1]; ++i) - indices_list.push_back({recv_buffer0[i], i, int(p)}); - std::ranges::sort(indices_list); + { + common::Timer timer("Topology: build and sort received index list"); + for (std::size_t p = 0; p < recv_disp0.size() - 1; ++p) + for (std::int32_t i = recv_disp0[p]; i < recv_disp0[p + 1]; ++i) + indices_list.push_back({recv_buffer0[i], i, int(p)}); + std::ranges::sort(indices_list); + } // Find which ranks have each index std::vector num_items_per_dest1(recv_disp0.size() - 1, 0); @@ -286,7 +299,7 @@ determine_sharing_ranks(MPI_Comm comm, std::span indices) std::array, 2> vertex_ownership_groups( const std::vector>& cells_owned, const std::vector>& cells_ghost, - std::span boundary_vertices) + std::span boundary_vertices, int 
num_threads) { common::Timer timer("Topology: determine vertex ownership groups (owned, " "undetermined, unowned)"); @@ -300,10 +313,17 @@ std::array, 2> vertex_ownership_groups( local_vertex_set.insert(local_vertex_set.end(), c.begin(), c.end()); { - dolfinx::radix_sort(local_vertex_set); + if (num_threads > 0) + { + boost::sort::block_indirect_sort(local_vertex_set.begin(), + local_vertex_set.end(), num_threads); + } + else + dolfinx::radix_sort(local_vertex_set); auto [unique_end, range_end] = std::ranges::unique(local_vertex_set); local_vertex_set.erase(unique_end, range_end); } + // Build set of ghost cell vertices (attached to a ghost cell) std::vector ghost_vertex_set; ghost_vertex_set.reserve( @@ -313,10 +333,19 @@ std::array, 2> vertex_ownership_groups( ghost_vertex_set.insert(ghost_vertex_set.end(), c.begin(), c.end()); { - dolfinx::radix_sort(ghost_vertex_set); + if (num_threads > 0) + { + + boost::sort::block_indirect_sort(ghost_vertex_set.begin(), + ghost_vertex_set.end(), num_threads); + } + else + dolfinx::radix_sort(ghost_vertex_set); + auto [unique_end, range_end] = std::ranges::unique(ghost_vertex_set); ghost_vertex_set.erase(unique_end, range_end); } + // Build difference 1: Vertices attached only to owned cells, and // therefore owned by this rank std::vector owned_vertices; @@ -329,7 +358,7 @@ std::array, 2> vertex_ownership_groups( std::ranges::set_difference(ghost_vertex_set, local_vertex_set, std::back_inserter(unowned_vertices)); - // TODO Check this in debug mode only? +#ifndef NDEBUG // Sanity check // No vertices in unowned should also be in boundary... 
std::vector unowned_vertices_in_error; @@ -341,6 +370,7 @@ std::array, 2> vertex_ownership_groups( throw std::runtime_error( "Adding boundary vertices in ghost cells not allowed."); } +#endif return {std::move(owned_vertices), std::move(unowned_vertices)}; } @@ -711,12 +741,12 @@ std::vector convert_to_local_indexing( std::vector data(g.size()); if (num_threads > 0) { - std::vector threads(num_threads); + std::vector threads; for (int i = 0; i < num_threads; ++i) { auto [c0, c1] = dolfinx::MPI::local_range(i, g.size(), num_threads); - threads[i] = std::jthread(transform, std::span(data.data() + c0, c1 - c0), - g.subspan(c0, c1 - c0), global_to_local); + threads.emplace_back(transform, std::span(data.data() + c0, c1 - c0), + g.subspan(c0, c1 - c0), global_to_local); } } else @@ -1091,17 +1121,17 @@ Topology mesh::create_topology( // Create sets of owned and unowned vertices from the cell ownership // and the list of boundary vertices - auto [owned_vertices, unowned_vertices] - = vertex_ownership_groups(owned_cells, ghost_cells, boundary_vertices); + auto [owned_vertices, unowned_vertices] = vertex_ownership_groups( + owned_cells, ghost_cells, boundary_vertices, num_threads); + + timer1.stop(); + timer1.flush(); // For each vertex whose ownership needs determining, find the sharing // ranks. The first index in the list of ranks for a vertex is the // owner (as determined by determine_sharing_ranks). 
const graph::AdjacencyList global_vertex_to_ranks - = determine_sharing_ranks(comm, boundary_vertices); - - timer1.stop(); - timer1.flush(); + = determine_sharing_ranks(comm, boundary_vertices, num_threads); // Iterate over vertices that have 'unknown' ownership, and if flagged // as owned by determine_sharing_ranks update ownership status @@ -1136,7 +1166,7 @@ Topology mesh::create_topology( // Number all owned vertices, iterating over vertices cell-wise std::vector local_vertex_indices(owned_vertices.size(), -1); { - common::Timer timer3("Topology: 3 "); + common::Timer timer3("Topology: 3"); std::int32_t v = 0; for (std::span cells_t : cells) { @@ -1153,6 +1183,45 @@ Topology mesh::create_topology( } } + // // NEW: Number all owned vertices + // // TODO: check if this re-ordering affects geometry retrieval + // // performance + // std::vector local_vertex_indices(owned_vertices.size(), -1); + // { + // common::Timer timer3("Topology: 3"); + + // common::Timer timer("Topology: number owned vertices"); + // std::int32_t v = 0; + // for (auto _cells : cells) + // { + // std::vector vertex_data; + // for (auto vtx : _cells) + // vertex_data.push_back(vtx); + // if (num_threads == 0) + // dolfinx::radix_sort(vertex_data); + // else + // { + // boost::sort::block_indirect_sort(vertex_data.begin(), + // vertex_data.end(), + // num_threads); + // } + + // auto ret = std::ranges::unique(vertex_data); + // auto it0 = owned_vertices.cbegin(); + // for (auto it = vertex_data.begin(); it != ret.begin(); ++it) + // { + // if (*it == *it0) + // { + // std::size_t d = std::distance(owned_vertices.cbegin(), it0); + // assert(d < owned_vertices.size()); + // assert(local_vertex_indices[d] == -1); + // local_vertex_indices[d] = v++; + // ++it0; + // } + // } + // } + // } + common::Timer timer4("Topology: 4"); // Compute the global offset for owned (local) vertex indices diff --git a/cpp/dolfinx/mesh/generation.h b/cpp/dolfinx/mesh/generation.h index 11437a882a5..b7890151337 
100644 --- a/cpp/dolfinx/mesh/generation.h +++ b/cpp/dolfinx/mesh/generation.h @@ -69,7 +69,7 @@ Mesh build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, std::array n, const CellPartitionFunction& partitioner, - const CellReorderFunction& reorder_fn); + const CellReorderFunction& reorder_fn, int num_threads); template Mesh build_prism(MPI_Comm comm, MPI_Comm subcomm, @@ -117,14 +117,14 @@ Mesh create_box(MPI_Comm comm, MPI_Comm subcomm, } if (!partitioner and dolfinx::MPI::size(comm) > 1) - partitioner = create_cell_partitioner(mesh::GhostMode::none, 2); + partitioner = create_cell_partitioner(mesh::GhostMode::none, 2, 0); switch (celltype) { case CellType::tetrahedron: return impl::build_tet(comm, subcomm, p, n, partitioner, reorder_fn); case CellType::hexahedron: - return impl::build_hex(comm, subcomm, p, n, partitioner, reorder_fn); + return impl::build_hex(comm, subcomm, p, n, partitioner, reorder_fn, 0); case CellType::prism: return impl::build_prism(comm, subcomm, p, n, partitioner, reorder_fn); default: @@ -193,7 +193,7 @@ Mesh create_rectangle(MPI_Comm comm, std::array, 2> p, } if (!partitioner and dolfinx::MPI::size(comm) > 1) - partitioner = create_cell_partitioner(mesh::GhostMode::none, 2); + partitioner = create_cell_partitioner(mesh::GhostMode::none, 2, 0); switch (celltype) { @@ -262,19 +262,19 @@ Mesh create_interval(MPI_Comm comm, std::int64_t n, std::array p, } if (!partitioner and dolfinx::MPI::size(comm) > 1) - partitioner = create_cell_partitioner(ghost_mode, 2); + partitioner = create_cell_partitioner(ghost_mode, 2, 0); fem::CoordinateElement element(CellType::interval, 1); if (dolfinx::MPI::rank(comm) == 0) { auto [x, cells] = impl::create_interval_cells(p, n); return create_mesh(comm, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, - {x.size(), 1}, partitioner, 2, reorder_fn); + {x.size(), 1}, partitioner, 2, 0, reorder_fn); } else { return create_mesh(comm, MPI_COMM_NULL, {}, element, MPI_COMM_NULL, - std::vector{}, {0, 1}, 
partitioner, 2, reorder_fn); + std::vector{}, {0, 1}, partitioner, 2, 0, reorder_fn); } } @@ -309,6 +309,8 @@ template std::vector create_geom(MPI_Comm comm, std::array, 2> p, std::array n) { + common::Timer timer("Build built-in mesh geometry"); + // Extract data auto [p0, p1] = p; const auto [nx, ny, nz] = n; @@ -367,6 +369,7 @@ Mesh build_tet(MPI_Comm comm, MPI_Comm subcomm, { x = create_geom(subcomm, p, n); + common::Timer timer("Build BoxMesh (tetrahedra): topology"); const auto [nx, ny, nz] = n; const std::int64_t n_cells = nx * ny * nz; @@ -398,7 +401,7 @@ Mesh build_tet(MPI_Comm comm, MPI_Comm subcomm, } return create_mesh(comm, subcomm, cells, element, subcomm, x, - {x.size() / 3, 3}, partitioner, 2, reorder_fn); + {x.size() / 3, 3}, partitioner, 2, 0, reorder_fn); } template @@ -407,7 +410,8 @@ build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, std::array n, const CellPartitionFunction& partitioner, const std::function( - const graph::AdjacencyList&)>& reorder_fn) + const graph::AdjacencyList&)>& reorder_fn, + int num_threads) { common::Timer timer("Build BoxMesh (hexahedra)"); std::vector x; @@ -417,6 +421,8 @@ build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, { x = create_geom(subcomm, p, n); + common::Timer timer("Build BoxMesh (hexahedra): topology"); + // Create cuboids const auto [nx, ny, nz] = n; const std::int64_t n_cells = nx * ny * nz; @@ -443,7 +449,8 @@ build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, } return create_mesh(comm, subcomm, cells, element, subcomm, x, - {x.size() / 3, 3}, partitioner, 2, reorder_fn); + {x.size() / 3, 3}, partitioner, 2, num_threads, + reorder_fn); } template @@ -491,7 +498,7 @@ Mesh build_prism(MPI_Comm comm, MPI_Comm subcomm, } return create_mesh(comm, subcomm, cells, element, subcomm, x, - {x.size() / 3, 3}, partitioner, 2, reorder_fn); + {x.size() / 3, 3}, partitioner, 2, 0, reorder_fn); } template @@ -641,12 +648,12 @@ Mesh build_tri(MPI_Comm comm, std::array, 2> p, } return 
create_mesh(comm, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, - {x.size() / 2, 2}, partitioner, 2, reorder_fn); + {x.size() / 2, 2}, partitioner, 2, 0, reorder_fn); } else { return create_mesh(comm, MPI_COMM_NULL, {}, element, MPI_COMM_NULL, - std::vector{}, {0, 2}, partitioner, 2, reorder_fn); + std::vector{}, {0, 2}, partitioner, 2, 0, reorder_fn); } } @@ -690,12 +697,12 @@ Mesh build_quad(MPI_Comm comm, std::array, 2> p, } return create_mesh(comm, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, - {x.size() / 2, 2}, partitioner, 2, reorder_fn); + {x.size() / 2, 2}, partitioner, 2, 0, reorder_fn); } else { return create_mesh(comm, MPI_COMM_NULL, {}, element, MPI_COMM_NULL, - std::vector{}, {0, 2}, partitioner, 2, reorder_fn); + std::vector{}, {0, 2}, partitioner, 2, 0, reorder_fn); } } } // namespace impl diff --git a/cpp/dolfinx/mesh/graphbuild.cpp b/cpp/dolfinx/mesh/graphbuild.cpp index e0f3e98ccaa..5871677fe1d 100644 --- a/cpp/dolfinx/mesh/graphbuild.cpp +++ b/cpp/dolfinx/mesh/graphbuild.cpp @@ -7,6 +7,7 @@ #include "graphbuild.h" #include "cell_types.h" #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include @@ -183,8 +185,8 @@ graph::AdjacencyList compute_nonlocal_dual_graph( } std::ranges::sort(dest_to_index); - // Build list of dest ranks and count number of items (facets+cell) to - // send to each dest post office (by neighbourhood rank) + // Build list of dest ranks and count number of items (facets+cell) + // to send to each dest post office (by neighbourhood rank) for (auto it = dest_to_index.begin(); it != dest_to_index.end();) { const int neigh_rank = dest.size(); @@ -222,8 +224,7 @@ graph::AdjacencyList compute_nonlocal_dual_graph( static_cast(dest.size()) / comm_size, static_cast(src.size()) / comm_size); - // Create neighbourhood communicator for sending data to - // post offices + // Create neighbourhood communicator for sending data to post offices MPI_Comm comm_po_post; 
MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), dest.data(), MPI_UNWEIGHTED, @@ -286,8 +287,8 @@ graph::AdjacencyList compute_nonlocal_dual_graph( // Search for consecutive facets (-> dual graph edge between cells) // and pack into send buffer. We store for every cell the number of - // matches, the offsets of each cell and the continuous data. - // Note: 'deges' is short for dual edges. + // matches, the offsets of each cell and the continuous data. Note: + // 'deges' is short for dual edges. std::vector dedge_send_count(recv_disp.back()); std::vector dedge_send_displs(dedge_send_count.size() + 1, 0); std::vector dedge_send_data; @@ -498,8 +499,9 @@ graph::AdjacencyList compute_nonlocal_dual_graph( offset += dedge_recv_count[i]; } - // local connections are possibly introduced again by remote -> remove - // duplicates + + // Local connections are possibly introduced again by remote -> + // remove duplicates std::size_t duplicates_count = 0; for (std::size_t node = 0; node < offsets.size() - 1; node++) { @@ -530,10 +532,11 @@ std::tuple, std::vector, int, mesh::build_local_dual_graph( std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { - spdlog::info("Build local part of mesh dual graph (mixed)"); - common::Timer timer("Compute local part of mesh dual graph (mixed)"); + num_threads = 0; // FIXME: temporary serial override; remove to honour the num_threads parameter + spdlog::info("Build local part of mesh dual graph"); + common::Timer timer("Compute local part of mesh dual graph"); if (std::size_t ncells_local = std::accumulate(cells.begin(), cells.end(), 0, @@ -558,6 +561,8 @@ mesh::build_local_dual_graph( // number of vertices per facet -> size computations for later on // used data structures + common::Timer timer0("Compute local part of mesh dual graph: 0"); + // TODO: cell_offsets can be removed? 
std::vector cell_offsets{0}; cell_offsets.reserve(cells.size() + 1); @@ -585,6 +590,9 @@ mesh::build_local_dual_graph( { max = std::max(max, cell_facets.num_links(node)); }); } + timer0.stop(); + timer0.flush(); + // 2) Build a list of (all) facets, defined by sorted vertices, with // the connected cell index after the vertices. For v_ij the j-th // vertex of the i-th facet. The last index is the cell index (non @@ -594,47 +602,133 @@ mesh::build_local_dual_graph( // ⋮ ⋮ ⋮ ⋮ ⋱ ⋮ ⋮ // v_n1, v_n2, -1, -1, ..., -1, n] - const int shape1 = max_vertices_per_facet + 1; - std::vector facets; - facets.reserve(facet_count * shape1); - constexpr std::int32_t padding_value = -1; + common::Timer timer1("Compute local part of mesh dual graph: 1"); - for (std::size_t j = 0; j < cells.size(); ++j) + auto build_facets_fn + = [](int shape1, int num_cell_vertices, std::size_t cell_offset, + const graph::AdjacencyList& cell_facets, + std::span cells, std::span facets) { - const CellType& cell_type = celltypes[j]; - std::span _cells = cells[j]; - - int num_cell_vertices = mesh::cell_num_entities(cell_type, 0); - std::int32_t num_cells = _cells.size() / num_cell_vertices; - graph::AdjacencyList cell_facets - = mesh::get_entity_vertices(cell_type, tdim - 1); - - for (std::int32_t c = 0; c < num_cells; ++c) + constexpr std::int32_t padding_value = -1; + for (std::size_t c = 0; c < cells.size() / num_cell_vertices; ++c) { // Loop over cell facets - std::span v = _cells.subspan(num_cell_vertices * c, num_cell_vertices); + auto v = cells.subspan(num_cell_vertices * c, num_cell_vertices); for (int f = 0; f < cell_facets.num_nodes(); ++f) { std::span facet_vertices = cell_facets.links(f); - std::ranges::transform(facet_vertices, std::back_inserter(facets), + auto facet_c = facets.subspan( + (c * cell_facets.num_nodes() + f) * shape1, shape1); + std::ranges::transform(facet_vertices, facet_c.begin(), [v](auto idx) { return v[idx]; }); - std::sort(std::prev(facets.end(), facet_vertices.size()), 
facets.end()); - facets.insert(facets.end(), - max_vertices_per_facet - facet_vertices.size(), - padding_value); - facets.push_back(c + cell_offsets[j]); + + auto it = std::next(facet_c.begin(), facet_vertices.size()); + std::sort(facet_c.begin(), it); + std::fill(it, facet_c.end(), padding_value); + facet_c.back() = c + cell_offset; } } + }; + + const int shape1 = max_vertices_per_facet + 1; + std::vector facets(facet_count * shape1); + for (std::size_t j = 0; j < cells.size(); ++j) + { + CellType cell_type = celltypes[j]; + int num_cell_vertices = mesh::cell_num_entities(cell_type, 0); + graph::AdjacencyList cell_facets + = mesh::get_entity_vertices(cell_type, tdim - 1); + std::span _cells = cells[j]; + if (num_threads > 0) + { + std::vector threads; + for (int i = 0; i < num_threads; ++i) + { + auto [c0, c1] = dolfinx::MPI::local_range( + i, _cells.size() / num_cell_vertices, num_threads); + threads.emplace_back( + build_facets_fn, shape1, num_cell_vertices, cell_offsets[j] + c0, + std::cref(cell_facets), + _cells.subspan(c0 * num_cell_vertices, + (c1 - c0) * num_cell_vertices), + std::span(facets.data() + (c0 * cell_facets.num_nodes()) * shape1, + ((c1 - c0) * cell_facets.num_nodes()) * shape1)); + } + } + else + { + build_facets_fn(shape1, num_cell_vertices, cell_offsets[j], + std::cref(cell_facets), _cells, facets); + } } + timer1.stop(); + timer1.flush(); + + // const int shape1 = max_vertices_per_facet + 1; + // std::vector facets; + // facets.reserve(facet_count * shape1); + // constexpr std::int32_t padding_value = -1; + + // for (std::size_t j = 0; j < cells.size(); ++j) + // { + // const CellType& cell_type = celltypes[j]; + // std::span _cells = cells[j]; + + // int num_cell_vertices = mesh::cell_num_entities(cell_type, 0); + // std::int32_t num_cells = _cells.size() / num_cell_vertices; + // graph::AdjacencyList cell_facets + // = mesh::get_entity_vertices(cell_type, tdim - 1); + + // for (std::int32_t c = 0; c < num_cells; ++c) + // { + // // Loop 
over cell facets + // std::span v = _cells.subspan(num_cell_vertices * c, num_cell_vertices); + // for (int f = 0; f < cell_facets.num_nodes(); ++f) + // { + // std::span facet_vertices = cell_facets.links(f); + // std::ranges::transform(facet_vertices, std::back_inserter(facets), + // [v](auto idx) { return v[idx]; }); + // std::sort(std::prev(facets.end(), facet_vertices.size()), + // facets.end()); facets.insert(facets.end(), + // max_vertices_per_facet - facet_vertices.size(), + // padding_value); + // facets.push_back(c + cell_offsets[j]); + // } + // } + // } + // 3) Sort facets by vertex key - std::vector perm - = dolfinx::sort_by_perm(std::span(facets), shape1); + common::Timer timer3("Compute local part of mesh dual graph: 3"); + + std::vector perm; + if (num_threads > 0) + { + perm.resize(facets.size() / shape1, 0); + std::iota(perm.begin(), perm.end(), 0); + boost::sort::block_indirect_sort( + perm.begin(), perm.end(), + [facets = std::cref(facets), shape1](auto f0, auto f1) + { + auto it0 = std::next(facets.get().begin(), f0 * shape1); + auto it1 = std::next(facets.get().begin(), f1 * shape1); + return std::lexicographical_compare(it0, std::next(it0, shape1), it1, + std::next(it1, shape1)); + }, + num_threads); + } + else + perm = dolfinx::sort_by_perm(std::span(facets), shape1); + + timer3.stop(); + timer3.flush(); // 4) Iterate over sorted list of facets. Facets shared by more than // one cell lead to a graph edge to be added. Facets that are not // shared are stored as these might be shared by a cell on another // process. 
+ common::Timer timer4("Compute local part of mesh dual graph: 4"); + std::vector unmatched_facets; std::vector local_cells; std::vector> edges; @@ -659,8 +753,7 @@ mesh::build_local_dual_graph( std::int32_t cell_count = matching_facets.size(); assert(cell_count >= 1); - if (!max_facet_to_cell_links.has_value() - or (cell_count < *max_facet_to_cell_links)) + if (!max_facet_to_cell_links or cell_count < *max_facet_to_cell_links) { // Store unmatched facets and the attached cell for (std::int32_t i = 0; i < cell_count; i++) @@ -676,12 +769,12 @@ mesh::build_local_dual_graph( // added later). In the range [it, it_next_facet), all // combinations are added. for (auto facet_a_it = it; facet_a_it != matching_facets.end(); - facet_a_it++) + ++facet_a_it) { std::span facet_a(facets.data() + *facet_a_it * shape1, shape1); std::int32_t cell_a = facet_a.back(); for (auto facet_b_it = std::next(facet_a_it); - facet_b_it != matching_facets.end(); facet_b_it++) + facet_b_it != matching_facets.end(); ++facet_b_it) { std::span facet_b(facets.data() + *facet_b_it * shape1, shape1); std::int32_t cell_b = facet_b.back(); @@ -694,13 +787,17 @@ mesh::build_local_dual_graph( } } + timer4.stop(); + timer4.flush(); + // 5) Build adjacency list data. Prepare data structure and assemble // into. Important: we have only computed one direction of the dual // edges, we add both forward and backward to the final data // structure. 
- std::vector num_links(cell_offsets.back(), 0); + common::Timer timer5("Compute local part of mesh dual graph: 5"); + std::vector num_links(cell_offsets.back(), 0); for (auto [a, b] : edges) { ++num_links[a]; @@ -718,6 +815,9 @@ mesh::build_local_dual_graph( data[pos[e[1]]++] = e[0]; }); + timer5.stop(); + timer5.flush(); + return {graph::AdjacencyList(std::move(data), std::move(offsets)), std::move(unmatched_facets), max_vertices_per_facet, std::move(local_cells)}; @@ -726,14 +826,15 @@ mesh::build_local_dual_graph( graph::AdjacencyList mesh::build_dual_graph(MPI_Comm comm, std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, + int num_threads) { spdlog::info("Building mesh dual graph"); // Compute local part of dual graph (cells are graph nodes, and edges // are connections by facet) - auto [local_graph, facets, shape1, fcells] - = mesh::build_local_dual_graph(celltypes, cells, max_facet_to_cell_links); + auto [local_graph, facets, shape1, fcells] = mesh::build_local_dual_graph( + celltypes, cells, max_facet_to_cell_links, num_threads); // Extend with nonlocal edges and convert to global indices graph::AdjacencyList graph diff --git a/cpp/dolfinx/mesh/graphbuild.h b/cpp/dolfinx/mesh/graphbuild.h index 7137ca99d67..863d164e49c 100644 --- a/cpp/dolfinx/mesh/graphbuild.h +++ b/cpp/dolfinx/mesh/graphbuild.h @@ -33,6 +33,8 @@ enum class CellType : std::int8_t; /// Passing std::nullopt (no upper bound) corresponds. /// to `max_facet_to_cell_links`=∞, i.e. every facet is considered /// unmatched. +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// /// @return /// 1. 
Local dual graph @@ -60,7 +62,8 @@ std::tuple, std::vector, int, std::vector> build_local_dual_graph(std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links); + std::optional max_facet_to_cell_links, + int num_threads); /// @brief Build distributed mesh dual graph (cell-cell connections via /// facets) from minimal mesh data. @@ -83,6 +86,8 @@ build_local_dual_graph(std::span celltypes, /// manifold meshes. Passing std::nullopt (no upper bound) corresponds /// to `max_facet_to_cell_links`=∞, i.e. every facet is considered /// unmatched. +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// /// @return The dual graph. /// @@ -96,6 +101,7 @@ build_local_dual_graph(std::span celltypes, graph::AdjacencyList build_dual_graph(MPI_Comm comm, std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links); + std::optional max_facet_to_cell_links, + int num_threads); } // namespace dolfinx::mesh diff --git a/cpp/dolfinx/mesh/topologycomputation.cpp b/cpp/dolfinx/mesh/topologycomputation.cpp index dd4b16c3971..d12d9d03837 100644 --- a/cpp/dolfinx/mesh/topologycomputation.cpp +++ b/cpp/dolfinx/mesh/topologycomputation.cpp @@ -656,7 +656,7 @@ compute_entities_by_key_matching( auto sort_threaded = [](const auto& entity_list_sorted, int num_vertices_per_entity, int num_threads) { - std::vector sort_order( + std::vector sort_order( entity_list_sorted.size() / num_vertices_per_entity, 0); std::iota(sort_order.begin(), sort_order.end(), 0); boost::sort::sample_sort( @@ -675,7 +675,7 @@ compute_entities_by_key_matching( }; // Sort the list and label uniquely - const std::vector sort_order + const std::vector sort_order = num_threads == 0 ? 
dolfinx::sort_by_perm(entity_list_sorted, num_vertices_per_entity) diff --git a/cpp/dolfinx/mesh/utils.cpp b/cpp/dolfinx/mesh/utils.cpp index 9a3e3f1db7c..d6dc8521bf5 100644 --- a/cpp/dolfinx/mesh/utils.cpp +++ b/cpp/dolfinx/mesh/utils.cpp @@ -99,9 +99,9 @@ std::vector mesh::exterior_facet_indices(const Topology& topology) //------------------------------------------------------------------------------ mesh::CellPartitionFunction mesh::create_cell_partitioner( mesh::GhostMode ghost_mode, const graph::partition_fn& partfn, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { - return [partfn, ghost_mode, max_facet_to_cell_links]( + return [partfn, ghost_mode, max_facet_to_cell_links, num_threads]( MPI_Comm comm, int nparts, const std::vector& cell_types, const std::vector>& cells) -> graph::AdjacencyList @@ -109,8 +109,8 @@ mesh::CellPartitionFunction mesh::create_cell_partitioner( spdlog::info("Compute partition of cells across ranks"); // Compute distributed dual graph (for the cells on this process) - graph::AdjacencyList dual_graph - = build_dual_graph(comm, cell_types, cells, max_facet_to_cell_links); + graph::AdjacencyList dual_graph = build_dual_graph( + comm, cell_types, cells, max_facet_to_cell_links, num_threads); // Just flag any kind of ghosting for now bool ghosting = (ghost_mode != GhostMode::none); @@ -122,10 +122,10 @@ mesh::CellPartitionFunction mesh::create_cell_partitioner( //----------------------------------------------------------------------------- mesh::CellPartitionFunction mesh::create_cell_partitioner( mesh::GhostMode ghost_mode, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { return create_cell_partitioner(ghost_mode, &graph::partition_graph, - max_facet_to_cell_links); + max_facet_to_cell_links, num_threads); } //----------------------------------------------------------------------------- std::vector diff --git a/cpp/dolfinx/mesh/utils.h 
b/cpp/dolfinx/mesh/utils.h index b67c40aa756..d04814a9a83 100644 --- a/cpp/dolfinx/mesh/utils.h +++ b/cpp/dolfinx/mesh/utils.h @@ -276,7 +276,7 @@ create_boundary_vertices_fn(const CellReorderFunction& reorder_fn, auto [graph, unmatched_facets, max_v, _facet_attached_cells] = build_local_dual_graph(std::vector{celltypes[i]}, std::vector{cells1_v_local.back()}, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); // Store unmatched_facets for current cell type facets.emplace_back(std::move(unmatched_facets), max_v); @@ -342,7 +342,7 @@ create_boundary_vertices_fn(const CellReorderFunction& reorder_fn, } // Compute row permutation - const std::vector perm = dolfinx::sort_by_perm( + const std::vector perm = dolfinx::sort_by_perm( std::span(facets0), max_v); // For facets in facets0 that appear only once, store the facet @@ -965,11 +965,12 @@ entities_to_geometry(const Mesh& mesh, int dim, /// @param[in] max_facet_to_cell_links Bound on the number of cells a /// facet needs to be connected to to be considered *matched* (not on /// boundary for non-branching meshes). +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// @return Function that computes the destination ranks for each cell. -CellPartitionFunction -create_cell_partitioner(mesh::GhostMode ghost_mode, - const graph::partition_fn& partfn, - std::optional max_facet_to_cell_links); +CellPartitionFunction create_cell_partitioner( + mesh::GhostMode ghost_mode, const graph::partition_fn& partfn, + std::optional max_facet_to_cell_links, int num_threads = 0); /// @brief Create a function that computes destination rank for mesh /// cells on this rank by applying the default graph partitioner to the @@ -979,11 +980,13 @@ create_cell_partitioner(mesh::GhostMode ghost_mode, /// @param[in] max_facet_to_cell_links Bound on the number of cells a /// facet needs to be connected to to be considered *matched* (not on /// boundary for non-branching meshes). 
+/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// @return Function that computes the destination ranks for each cell. CellPartitionFunction create_cell_partitioner(mesh::GhostMode ghost_mode, - std::optional max_facet_to_cell_links - = 2); + std::optional max_facet_to_cell_links, + int num_threads = 0); /// @brief Compute incident entities. /// @param[in] topology The topology. @@ -1037,8 +1040,10 @@ compute_incident_entities(const Topology& topology, /// redistributed. /// @param[in] max_facet_to_cell_links Bound on the number of cells a /// facet can be connected to. -/// @param[in] reorder_fn Function that reorders (locally) cells that -/// are owned by this process. +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. +/// @param[in] reorder_fn Function that reorders (locally) cells that are +/// owned by this process. /// @return A mesh distributed on the communicator `comm`. template Mesh> create_mesh( @@ -1048,7 +1053,7 @@ Mesh> create_mesh( typename std::remove_reference_t>>& elements, MPI_Comm commg, const U& x, std::array xshape, const CellPartitionFunction& partitioner, - std::optional max_facet_to_cell_links, + std::optional max_facet_to_cell_links, int num_threads, const CellReorderFunction& reorder_fn = graph::reorder_gps) { assert(cells.size() == elements.size()); @@ -1172,7 +1177,7 @@ Mesh> create_mesh( [](auto& c) { return std::span(c); }); Topology topology = create_topology(comm, celltypes, cells1_v_span, original_idx1_span, - ghost_owners_span, boundary_v, 0); + ghost_owners_span, boundary_v, num_threads); // Create connectivities required higher-order geometries for creating // a Geometry object @@ -1262,12 +1267,12 @@ Mesh> create_mesh( typename std::remove_reference_t>& element, MPI_Comm commg, const U& x, std::array xshape, const CellPartitionFunction& partitioner, - std::optional max_facet_to_cell_links, + std::optional max_facet_to_cell_links, int num_threads, 
const CellReorderFunction& reorder_fn = graph::reorder_gps) { return create_mesh(comm, commt, std::vector{cells}, std::vector{element}, commg, x, xshape, partitioner, max_facet_to_cell_links, - reorder_fn); + num_threads, reorder_fn); } /// @brief Create a distributed mesh from mesh data using the default @@ -1301,14 +1306,14 @@ create_mesh(MPI_Comm comm, std::span cells, if (dolfinx::MPI::size(comm) == 1) { return create_mesh(comm, comm, std::vector{cells}, std::vector{elements}, - comm, x, xshape, nullptr, max_facet_to_cell_links); + comm, x, xshape, nullptr, max_facet_to_cell_links, 0); } else { return create_mesh( comm, comm, std::vector{cells}, std::vector{elements}, comm, x, xshape, - create_cell_partitioner(ghost_mode, max_facet_to_cell_links), - max_facet_to_cell_links); + create_cell_partitioner(ghost_mode, max_facet_to_cell_links, 0), + max_facet_to_cell_links, 0); } } diff --git a/cpp/dolfinx/refinement/refine.h b/cpp/dolfinx/refinement/refine.h index 157d5da4c98..f5fc41c7d4e 100644 --- a/cpp/dolfinx/refinement/refine.h +++ b/cpp/dolfinx/refinement/refine.h @@ -74,7 +74,7 @@ create_identity_partitioner(const mesh::Mesh& parent_mesh, if (comm == MPI_COMM_NULL) return graph::regular_adjacency_list(std::move(destinations), 1); - auto dual_graph = mesh::build_dual_graph(comm, cell_types, cells, 2); + auto dual_graph = mesh::build_dual_graph(comm, cell_types, cells, 2, 0); std::vector node_disp(MPI::size(comm) + 1, 0); std::int32_t local_size = dual_graph.num_nodes(); MPI_Allgather(&local_size, 1, dolfinx::MPI::mpi_t, @@ -152,7 +152,7 @@ refine(const mesh::Mesh& mesh, mesh::Mesh mesh1 = mesh::create_mesh( mesh.comm(), mesh.comm(), cell_adj.array(), mesh.geometry().cmap(), mesh.comm(), new_vertex_coords, xshape, - std::get(partitioner), 2); + std::get(partitioner), 2, 0); // Report the number of refined cells const int D = topology->dim(); diff --git a/cpp/dolfinx/refinement/uniform.cpp b/cpp/dolfinx/refinement/uniform.cpp index 126a12cecc1..7f7a1e3d2dd 100644 
--- a/cpp/dolfinx/refinement/uniform.cpp +++ b/cpp/dolfinx/refinement/uniform.cpp @@ -297,7 +297,7 @@ refinement::uniform_refine(const mesh::Mesh& mesh, mixed_topology.end()); mesh::Mesh new_mesh = mesh::create_mesh( mesh.comm(), mesh.comm(), topo_span, mesh.geometry().cmaps(), mesh.comm(), - new_x, {new_x.size() / 3, 3}, partitioner, 2); + new_x, {new_x.size() / 3, 3}, partitioner, 2, 0); return new_mesh; } diff --git a/cpp/dolfinx/refinement/uniform.h b/cpp/dolfinx/refinement/uniform.h index d90cc96bcf9..33756ab0022 100644 --- a/cpp/dolfinx/refinement/uniform.h +++ b/cpp/dolfinx/refinement/uniform.h @@ -19,6 +19,6 @@ template mesh::Mesh uniform_refine(const mesh::Mesh& mesh, const mesh::CellPartitionFunction& partitioner - = mesh::create_cell_partitioner(mesh::GhostMode::none, 2)); + = mesh::create_cell_partitioner(mesh::GhostMode::none, 2, 0)); } // namespace dolfinx::refinement diff --git a/cpp/test/common/sort.cpp b/cpp/test/common/sort.cpp index e6a60d0298e..a1810679f1e 100644 --- a/cpp/test/common/sort.cpp +++ b/cpp/test/common/sort.cpp @@ -106,7 +106,7 @@ TEST_CASE("Test argsort bitset") auto generator = std::bind(distribution, engine); std::generate(arr.begin(), arr.end(), generator); - std::vector perm + std::vector perm = dolfinx::sort_by_perm(arr, shape1); REQUIRE((int)perm.size() == shape0); diff --git a/cpp/test/fem/form.cpp b/cpp/test/fem/form.cpp index 348ae15c5eb..18042329c0f 100644 --- a/cpp/test/fem/form.cpp +++ b/cpp/test/fem/form.cpp @@ -54,7 +54,7 @@ TEST_CASE("Create Expression/Form (mismatch of mesh geometry)", auto mesh = std::make_shared>(mesh::create_box( MPI_COMM_WORLD, {{{0.0, 0.0, 0.0}, {0.96, 4.5, 2.0}}}, {2, 4, 5}, mesh::CellType::hexahedron, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2, 0))); auto element = basix::create_element( basix::element::family::P, basix::cell::type::hexahedron, 1, basix::element::lagrange_variant::unset, diff --git a/cpp/test/matrix.cpp 
b/cpp/test/matrix.cpp index 4b0ac30fd87..3a77b82a27b 100644 --- a/cpp/test/matrix.cpp +++ b/cpp/test/matrix.cpp @@ -30,7 +30,7 @@ namespace template la::MatrixCSR create_operator(MPI_Comm comm) { - auto part = mesh::create_cell_partitioner(mesh::GhostMode::none, 2); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::none, 2, 0); auto mesh = std::make_shared>( mesh::create_box(comm, {{{0.0, 0.0, 0.0}, {1.0, 1.0, 1.0}}}, {12, 12, 12}, mesh::CellType::tetrahedron, part)); diff --git a/cpp/test/mesh/branching_manifold.cpp b/cpp/test/mesh/branching_manifold.cpp index 6ed5fab9bef..29c7f7d91cc 100644 --- a/cpp/test/mesh/branching_manifold.cpp +++ b/cpp/test/mesh/branching_manifold.cpp @@ -48,7 +48,7 @@ TEST_CASE("dual_graph_branching") { // default auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] - = mesh::build_local_dual_graph(celltypes, {cells}, 2); + = mesh::build_local_dual_graph(celltypes, {cells}, 2, 0); CHECK(dual_graph.num_nodes() == 4); @@ -83,7 +83,7 @@ TEST_CASE("dual_graph_branching") // max_facet_to_cell_links = 3 // Note: additionally facet (2) is now considered unmatched auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] - = mesh::build_local_dual_graph(celltypes, {cells}, 3); + = mesh::build_local_dual_graph(celltypes, {cells}, 3, 0); CHECK(dual_graph.num_nodes() == 4); @@ -125,7 +125,7 @@ TEST_CASE("dual_graph_branching") auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] = mesh::build_local_dual_graph(celltypes, {cells}, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); CHECK(dual_graph.num_nodes() == 4); @@ -184,7 +184,7 @@ TEST_CASE("dual_graph_self_dual") { auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] = mesh::build_local_dual_graph(celltypes, {cells}, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); CHECK(max_vertices_per_facet == 1); CHECK(dual_graph.num_nodes() == 3); @@ -254,7 +254,7 @@ TEST_CASE("dual_graph_branching_parallel") // 
Check local dual graphs. auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] - = mesh::build_local_dual_graph(celltypes, {cells}, 3); + = mesh::build_local_dual_graph(celltypes, {cells}, 3, 0); CHECK(max_vertices_per_facet == 1); @@ -283,7 +283,7 @@ TEST_CASE("dual_graph_branching_parallel") } auto dual_graph = mesh::build_dual_graph( - comm, celltypes, std::vector>{cells}, 3); + comm, celltypes, std::vector>{cells}, 3, 0); if (dolfinx::MPI::rank(comm) == 0) { diff --git a/cpp/test/mesh/distributed_mesh.cpp b/cpp/test/mesh/distributed_mesh.cpp index d51b32b929d..6eeb1659228 100644 --- a/cpp/test/mesh/distributed_mesh.cpp +++ b/cpp/test/mesh/distributed_mesh.cpp @@ -27,7 +27,7 @@ constexpr int N = 8; [[maybe_unused]] void create_mesh_file(MPI_Comm comm) { // Create mesh using all processes and save xdmf - auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2); auto mesh = std::make_shared>( mesh::create_rectangle(comm, {{{0.0, 0.0}, {1.0, 1.0}}}, {N, N}, mesh::CellType::triangle, part)); @@ -139,7 +139,7 @@ void test_distributed_mesh(const mesh::CellPartitionFunction& partitioner) // Build mesh mesh::Mesh mesh = mesh::create_mesh(comm, subset_comm, cells, cmap, comm, x, - xshape, partitioner, 2); + xshape, partitioner, 2, 0); auto t = mesh.topology(); int tdim = t->dim(); CHECK(t->index_map(tdim)->size_global() == 2 * N * N); diff --git a/cpp/test/mesh/refinement/interval.cpp b/cpp/test/mesh/refinement/interval.cpp index a3297d49de8..503a3f80243 100644 --- a/cpp/test/mesh/refinement/interval.cpp +++ b/cpp/test/mesh/refinement/interval.cpp @@ -61,7 +61,7 @@ mesh::Mesh create_3_vertex_interval_mesh() return mesh::create_mesh( MPI_COMM_SELF, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, {x.size() / 3, 3}, - mesh::create_cell_partitioner(mesh::GhostMode::none, 2), 2); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2), 2, 0); } 
TEMPLATE_TEST_CASE("Interval uniform refinement", @@ -115,7 +115,7 @@ TEMPLATE_TEST_CASE("Interval adaptive refinement", // TODO: parent_facet auto [refined_mesh, parent_edge, parent_facet] = refinement::refine( mesh, std::span(edges), - mesh::create_cell_partitioner(mesh::GhostMode::shared_facet), + mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2), refinement::Option::parent_cell); std::vector expected_x = { @@ -185,7 +185,7 @@ TEMPLATE_TEST_CASE("Interval Refinement (parallel)", MPI_Comm commt = rank == 0 ? MPI_COMM_SELF : MPI_COMM_NULL; return mesh::create_mesh(MPI_COMM_WORLD, commt, cells, element, commt, x, - {x.size() / 3, 3}, partitioner, 2); + {x.size() / 3, 3}, partitioner, 2, 0); }; mesh::Mesh mesh = create_mesh(); diff --git a/cpp/test/mesh/refinement/rectangle.cpp b/cpp/test/mesh/refinement/rectangle.cpp index 8c0534fa951..5686ffceb45 100644 --- a/cpp/test/mesh/refinement/rectangle.cpp +++ b/cpp/test/mesh/refinement/rectangle.cpp @@ -98,7 +98,8 @@ plotter.show() mesh.topology()->create_entities(1, std::thread::hardware_concurrency()); auto [mesh_fine, parent_cell, parent_facet] = refinement::refine( - mesh, std::nullopt, mesh::create_cell_partitioner(mesh::GhostMode::none), + mesh, std::nullopt, + mesh::create_cell_partitioner(mesh::GhostMode::none, 2), refinement::Option::parent_cell_and_facet); // vertex layout: diff --git a/python/demo/demo_mixed-topology.py b/python/demo/demo_mixed-topology.py index 01fbf51a60b..28db34e52e3 100644 --- a/python/demo/demo_mixed-topology.py +++ b/python/demo/demo_mixed-topology.py @@ -108,9 +108,9 @@ hexahedron = coordinate_element(CellType.hexahedron, 1) prism = coordinate_element(CellType.prism, 1) -part = create_cell_partitioner(GhostMode.none, 2) # type: ignore +part = create_cell_partitioner(GhostMode.none, 2, 0) # type: ignore mesh = create_mesh( - MPI.COMM_WORLD, cells_np, [hexahedron._cpp_object, prism._cpp_object], geomx, part, 2 + MPI.COMM_WORLD, cells_np, [hexahedron._cpp_object, 
prism._cpp_object], geomx, part, 2, 0 ) # - diff --git a/python/dolfinx/io/utils.py b/python/dolfinx/io/utils.py index 64da295fea3..684e0a89981 100644 --- a/python/dolfinx/io/utils.py +++ b/python/dolfinx/io/utils.py @@ -273,6 +273,7 @@ def read_mesh( x, create_cell_partitioner(ghost_mode, max_facet_to_cell_links), # type: ignore max_facet_to_cell_links, + 0, ) msh.name = name domain = ufl.Mesh(basix_el) diff --git a/python/dolfinx/mesh.py b/python/dolfinx/mesh.py index c326acc04b0..26b44b8d17c 100644 --- a/python/dolfinx/mesh.py +++ b/python/dolfinx/mesh.py @@ -84,7 +84,7 @@ @singledispatch def create_cell_partitioner( - part: Callable, mode: GhostMode, max_facet_to_cell_links: int + part: Callable, mode: GhostMode, max_facet_to_cell_links: int, num_threads: int = 0 ) -> Callable: """Create a function to partition a mesh. @@ -95,15 +95,17 @@ def create_cell_partitioner( facet. Equal to 2 for non-branching manifold meshes. ``None`` corresponds to no upper bound on the number of possible connections. + num_threads: Number of CPU threads to use when creating. If 0, + threads are not spawned. Return: Partitioning function. """ - return _cpp.mesh.create_cell_partitioner(part, mode, max_facet_to_cell_links) + return _cpp.mesh.create_cell_partitioner(part, mode, max_facet_to_cell_links, num_threads) @create_cell_partitioner.register(GhostMode) -def _(mode: GhostMode, max_facet_to_cell_links: int) -> Callable: +def _(mode: GhostMode, max_facet_to_cell_links: int, num_threads: int = 0) -> Callable: """Create a function to partition a mesh. Args: @@ -112,11 +114,13 @@ def _(mode: GhostMode, max_facet_to_cell_links: int) -> Callable: facet. Equal to 2 for non-branching manifold meshes. ``None`` corresponds to no upper bound on the number of possible connections. + num_threads: Number of CPU threads to use when creating. If 0, + threads are not spawned. Return: Partitioning function. 
""" - return _cpp.mesh.create_cell_partitioner(mode, max_facet_to_cell_links) + return _cpp.mesh.create_cell_partitioner(mode, max_facet_to_cell_links, num_threads) class Topology: @@ -763,6 +767,7 @@ def create_mesh( x: npt.NDArray[np.floating], partitioner: Callable | None = None, max_facet_to_cell_links: int = 2, + num_threads: int = 0, ) -> Mesh: """Create a mesh from topology and geometry arrays. @@ -778,6 +783,8 @@ def create_mesh( cells across MPI ranks. max_facet_to_cell_links: Maximum number of cells a facet can be connected to. + num_threads: Number of CPU threads to use when creating the + mesh. If 0, threads are not spawned. Note: If required, the coordinates ``x`` will be cast to the same @@ -830,7 +837,7 @@ def create_mesh( x = np.asarray(x, dtype=dtype, order="C") cells = np.asarray(cells, dtype=np.int64, order="C") msh: _cpp.mesh.Mesh_float32 | _cpp.mesh.Mesh_float64 = _cpp.mesh.create_mesh( - comm, cells, cmap._cpp_object, x, partitioner, max_facet_to_cell_links + comm, cells, cmap._cpp_object, x, partitioner, max_facet_to_cell_links, num_threads ) return Mesh(msh, domain) # type: ignore @@ -1049,7 +1056,7 @@ def create_rectangle( A mesh of a rectangle. """ if partitioner is None and comm.size > 1: - partitioner = _cpp.mesh.create_cell_partitioner(ghost_mode, 2) + partitioner = create_cell_partitioner(ghost_mode, 2) domain = ufl.Mesh( basix.ufl.element( "Lagrange", @@ -1119,6 +1126,7 @@ def create_box( dtype: npt.DTypeLike = default_real_type, ghost_mode=GhostMode.shared_facet, partitioner=None, + num_threads: int = 0, ) -> Mesh: """Create a box mesh. @@ -1133,12 +1141,14 @@ def create_box( ghost_mode: The ghost mode used in the mesh partitioning. partitioner: Function that computes the parallel distribution of cells across MPI ranks. + num_threads: Number of threads to use when creating the mesh. If + 0, no threads are launched.. Returns: A mesh of a box domain. 
""" if partitioner is None and comm.size > 1: - partitioner = _cpp.mesh.create_cell_partitioner(ghost_mode, 2) + partitioner = create_cell_partitioner(ghost_mode, 2, num_threads) domain = ufl.Mesh( basix.ufl.element( "Lagrange", diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h b/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h index a5e805d3b81..c792b23ee38 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h @@ -77,7 +77,6 @@ create_cell_partitioner_cpp(const PythonCellPartitionFunction& p); namespace dolfinx_wrappers { - template void declare_meshtags(nb::module_& m, const std::string& type) { @@ -301,7 +300,7 @@ void declare_mesh(nb::module_& m, std::string type) const std::vector>& elements, nb::ndarray x, const part::impl::PythonCellPartitionFunction& p, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::size_t shape1 = x.ndim() == 1 ? 1 : x.shape(1); @@ -330,13 +329,13 @@ void declare_mesh(nb::module_& m, std::string type) return dolfinx::mesh::create_mesh( comm.get(), comm.get(), cells, elements, comm.get(), std::span(x.data(), x.size()), {x.shape(0), shape1}, p_wrap, - max_facet_to_cell_links); + max_facet_to_cell_links, num_threads); } else return dolfinx::mesh::create_mesh( comm.get(), comm.get(), cells, elements, comm.get(), std::span(x.data(), x.size()), {x.shape(0), shape1}, nullptr, - max_facet_to_cell_links); + max_facet_to_cell_links, num_threads); }); m.def( @@ -346,7 +345,7 @@ void declare_mesh(nb::module_& m, std::string type) const dolfinx::fem::CoordinateElement& element, nb::ndarray x, const part::impl::PythonCellPartitionFunction& p, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::size_t shape1 = x.ndim() == 1 ? 
1 : x.shape(1); if (p) @@ -369,19 +368,21 @@ void declare_mesh(nb::module_& m, std::string type) return dolfinx::mesh::create_mesh( comm.get(), comm.get(), std::span(cells.data(), cells.size()), element, comm.get(), std::span(x.data(), x.size()), - {x.shape(0), shape1}, p_wrap, max_facet_to_cell_links); + {x.shape(0), shape1}, p_wrap, max_facet_to_cell_links, + num_threads); } else { return dolfinx::mesh::create_mesh( comm.get(), comm.get(), std::span(cells.data(), cells.size()), element, comm.get(), std::span(x.data(), x.size()), - {x.shape(0), shape1}, nullptr, max_facet_to_cell_links); + {x.shape(0), shape1}, nullptr, max_facet_to_cell_links, + num_threads); } }, nb::arg("comm"), nb::arg("cells"), nb::arg("element"), nb::arg("x").noconvert(), nb::arg("partitioner").none(), - nb::arg("max_facet_to_cell_links").none(), + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), "Helper function for creating meshes."); m.def( "create_submesh", diff --git a/python/dolfinx/wrappers/mesh.cpp b/python/dolfinx/wrappers/mesh.cpp index d86035de9bd..cfb276432f3 100644 --- a/python/dolfinx/wrappers/mesh.cpp +++ b/python/dolfinx/wrappers/mesh.cpp @@ -105,14 +105,16 @@ void mesh(nb::module_& m) "build_dual_graph", [](const MPICommWrapper comm, dolfinx::mesh::CellType cell_type, const dolfinx::graph::AdjacencyList& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::vector c = {cell_type}; return dolfinx::mesh::build_dual_graph( - comm.get(), std::span{c}, {cells.array()}, max_facet_to_cell_links); + comm.get(), std::span{c}, {cells.array()}, max_facet_to_cell_links, + num_threads); }, nb::arg("comm"), nb::arg("cell_type"), nb::arg("cells"), - nb::arg("max_facet_to_cell_links").none(), "Build dual graph for cells"); + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), + "Build dual graph for cells"); m.def( "build_dual_graph", @@ -120,7 +122,7 @@ void mesh(nb::module_& m) std::vector& 
cell_types, const std::vector< nb::ndarray, nb::c_contig>>& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::vector> cell_span(cells.size()); for (std::size_t i = 0; i < cells.size(); ++i) @@ -129,10 +131,12 @@ void mesh(nb::module_& m) = std::span(cells[i].data(), cells[i].size()); } return dolfinx::mesh::build_dual_graph( - comm.get(), cell_types, cell_span, max_facet_to_cell_links); + comm.get(), cell_types, cell_span, max_facet_to_cell_links, + num_threads); }, nb::arg("comm"), nb::arg("cell_types"), nb::arg("cells"), - nb::arg("max_facet_to_cell_links").none(), "Build dual graph for cells"); + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), + "Build dual graph for cells"); // dolfinx::mesh::GhostMode enums nb::enum_(m, "GhostMode") @@ -339,16 +343,16 @@ void mesh(nb::module_& m) m.def( "create_cell_partitioner", [](dolfinx::mesh::GhostMode mode, - std::optional max_facet_to_cell_links) - -> part::impl::PythonCellPartitionFunction + std::optional max_facet_to_cell_links, + int num_threads) -> part::impl::PythonCellPartitionFunction { return part::impl::create_cell_partitioner_py( dolfinx::mesh::create_cell_partitioner( - mode, &dolfinx::graph::partition_graph, - max_facet_to_cell_links)); + mode, &dolfinx::graph::partition_graph, max_facet_to_cell_links, + num_threads)); }, nb::arg("mode"), nb::arg("max_facet_to_cell_links").none(), - "Create default cell partitioner."); + nb::arg("num_threads"), "Create a default cell partitioner."); m.def( "create_cell_partitioner", [](const std::function( @@ -356,16 +360,16 @@ void mesh(nb::module_& m) const dolfinx::graph::AdjacencyList& local_graph, bool ghosting)>& part, dolfinx::mesh::GhostMode mode, - std::optional max_facet_to_cell_links) - -> part::impl::PythonCellPartitionFunction + std::optional max_facet_to_cell_links, + int num_threads) -> part::impl::PythonCellPartitionFunction { return part::impl::create_cell_partitioner_py( 
dolfinx::mesh::create_cell_partitioner( mode, part::impl::create_partitioner_cpp(part), - max_facet_to_cell_links)); + max_facet_to_cell_links, num_threads)); }, nb::arg("part"), nb::arg("ghost_mode"), - nb::arg("max_facet_to_cell_links").none(), + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), "Create a cell partitioner from a graph partitioning function."); m.def( diff --git a/python/test/unit/mesh/test_dual_graph.py b/python/test/unit/mesh/test_dual_graph.py index a4ce550d949..42f10013245 100644 --- a/python/test/unit/mesh/test_dual_graph.py +++ b/python/test/unit/mesh/test_dual_graph.py @@ -26,7 +26,7 @@ def test_dgrsph_1d(): # Circular chain of interval cells cells = [[n0, n0 + 1], [n0 + 1, n0 + 2], [n0 + 2, x]] w = mesh.build_dual_graph( - MPI.COMM_WORLD, mesh.CellType.interval, to_adj(cells, np.int64)._cpp_object, 2 + MPI.COMM_WORLD, mesh.CellType.interval, to_adj(cells, np.int64)._cpp_object, 2, 0 ) assert w.num_nodes == 3 for i in range(w.num_nodes):