diff --git a/cpp/demo/biharmonic/main.cpp b/cpp/demo/biharmonic/main.cpp index 77277910836..88644089380 100644 --- a/cpp/demo/biharmonic/main.cpp +++ b/cpp/demo/biharmonic/main.cpp @@ -157,7 +157,7 @@ int main(int argc, char* argv[]) PetscInitialize(&argc, &argv, nullptr, nullptr); { // Create mesh - auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2); auto mesh = std::make_shared>( mesh::create_rectangle(MPI_COMM_WORLD, {{{0.0, 0.0}, {1.0, 1.0}}}, {32, 32}, mesh::CellType::triangle, part)); diff --git a/cpp/demo/codim_0_assembly/main.cpp b/cpp/demo/codim_0_assembly/main.cpp index 5be7086e5e2..caeb81afbf0 100644 --- a/cpp/demo/codim_0_assembly/main.cpp +++ b/cpp/demo/codim_0_assembly/main.cpp @@ -35,7 +35,7 @@ int main(int argc, char* argv[]) auto mesh = std::make_shared>(mesh::create_rectangle( MPI_COMM_WORLD, {{{0.0, 0.0}, {2.0, 1.0}}}, {1, 4}, mesh::CellType::quadrilateral, - mesh::create_cell_partitioner(mesh::GhostMode::shared_facet))); + mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2))); basix::FiniteElement element = basix::create_element( basix::element::family::P, basix::cell::type::quadrilateral, 1, diff --git a/cpp/demo/hyperelasticity/main.cpp b/cpp/demo/hyperelasticity/main.cpp index df95ae8b51b..2978e7dc58c 100644 --- a/cpp/demo/hyperelasticity/main.cpp +++ b/cpp/demo/hyperelasticity/main.cpp @@ -159,7 +159,7 @@ int main(int argc, char* argv[]) auto mesh = std::make_shared>(mesh::create_box( MPI_COMM_WORLD, {{{0.0, 0.0, 0.0}, {1.0, 1.0, 1.0}}}, {10, 10, 10}, mesh::CellType::tetrahedron, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); auto element = basix::create_element( basix::element::family::P, basix::cell::type::tetrahedron, 1, diff --git a/cpp/demo/interpolation-io/main.cpp b/cpp/demo/interpolation-io/main.cpp index 2fe2625e0c4..50666a56c7e 100644 --- 
a/cpp/demo/interpolation-io/main.cpp +++ b/cpp/demo/interpolation-io/main.cpp @@ -209,7 +209,7 @@ int main(int argc, char* argv[]) = std::make_shared>(mesh::create_rectangle( MPI_COMM_WORLD, {{{0.0, 0.0}, {1.0, 1.0}}}, {32, 4}, mesh::CellType::triangle, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); // Create mesh using same topology as mesh0, but with different // scalar type for geometry @@ -217,7 +217,7 @@ int main(int argc, char* argv[]) = std::make_shared>(mesh::create_rectangle( MPI_COMM_WORLD, {{{0.0, 0.0}, {1.0, 1.0}}}, {32, 4}, mesh::CellType::triangle, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); // Interpolate a function in a scalar Lagrange space and output the // result to file for visualisation using different types diff --git a/cpp/demo/poisson/main.cpp b/cpp/demo/poisson/main.cpp index 2684403557f..e33a3aba712 100644 --- a/cpp/demo/poisson/main.cpp +++ b/cpp/demo/poisson/main.cpp @@ -111,7 +111,7 @@ int main(int argc, char* argv[]) { // Create mesh and function space - auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2); auto mesh = std::make_shared>( mesh::create_rectangle(MPI_COMM_WORLD, {{{0.0, 0.0}, {2.0, 1.0}}}, {32, 16}, mesh::CellType::triangle, part)); diff --git a/cpp/demo/poisson_matrix_free/main.cpp b/cpp/demo/poisson_matrix_free/main.cpp index 7e199f32c00..2181d956bcd 100644 --- a/cpp/demo/poisson_matrix_free/main.cpp +++ b/cpp/demo/poisson_matrix_free/main.cpp @@ -138,7 +138,7 @@ void solver(MPI_Comm comm) // Create mesh and function space auto mesh = std::make_shared>(mesh::create_rectangle( comm, {{{0.0, 0.0}, {1.0, 1.0}}}, {10, 10}, mesh::CellType::triangle, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2))); auto element = 
basix::create_element( basix::element::family::P, basix::cell::type::triangle, 2, basix::element::lagrange_variant::unset, diff --git a/cpp/dolfinx/common/sort.h b/cpp/dolfinx/common/sort.h index 3dd59aeb9d0..b2faf137be9 100644 --- a/cpp/dolfinx/common/sort.h +++ b/cpp/dolfinx/common/sort.h @@ -174,17 +174,17 @@ constexpr void radix_sort(R&& range, P proj = {}) /// @note This function is suitable for small values of `shape1`. Each /// column of `x` is copied into an array that is then sorted. template -std::vector sort_by_perm(std::span x, std::size_t shape1) +std::vector sort_by_perm(std::span x, std::size_t shape1) { static_assert(std::is_integral_v, "Integral required."); if (x.empty()) - return std::vector{}; + return std::vector{}; assert(shape1 > 0); assert(x.size() % shape1 == 0); const std::size_t shape0 = x.size() / shape1; - std::vector perm(shape0); + std::vector perm(shape0); std::iota(perm.begin(), perm.end(), 0); // Sort by each column, right to left. Col 0 has the most significant diff --git a/cpp/dolfinx/io/VTKHDF.h b/cpp/dolfinx/io/VTKHDF.h index 66c35d152e2..1a7730942ce 100644 --- a/cpp/dolfinx/io/VTKHDF.h +++ b/cpp/dolfinx/io/VTKHDF.h @@ -468,11 +468,11 @@ mesh::Mesh read_mesh(MPI_Comm comm, const std::filesystem::path& filename, auto part = create_cell_partitioner(mesh::GhostMode::none, dolfinx::graph::partition_graph, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); std::vector> cells_span(cells_local.begin(), cells_local.end()); return mesh::create_mesh(comm, comm, cells_span, coordinate_elements, comm, points_pruned, {(std::size_t)x_shape[0], gdim}, part, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); } } // namespace dolfinx::io::VTKHDF diff --git a/cpp/dolfinx/mesh/Topology.cpp b/cpp/dolfinx/mesh/Topology.cpp index f6f0a31624d..0c62e757c79 100644 --- a/cpp/dolfinx/mesh/Topology.cpp +++ b/cpp/dolfinx/mesh/Topology.cpp @@ -10,6 +10,8 @@ #include "topologycomputation.h" #include "utils.h" #include +#include +#include 
#include #include #include @@ -19,6 +21,7 @@ #include #include #include +#include using namespace dolfinx; using namespace dolfinx::mesh; @@ -40,7 +43,8 @@ namespace /// @return Map from global index to sharing ranks for each index in /// indices. The owner rank is the first as the first in the of ranks. graph::AdjacencyList -determine_sharing_ranks(MPI_Comm comm, std::span indices) +determine_sharing_ranks(MPI_Comm comm, std::span indices, + int num_threads) { common::Timer timer("Topology: determine shared index ownership"); @@ -63,7 +67,13 @@ determine_sharing_ranks(MPI_Comm comm, std::span indices) int dest = dolfinx::MPI::index_owner(size, idx, global_range); dest_to_index.push_back({dest, static_cast(dest_to_index.size())}); } - std::ranges::sort(dest_to_index); + if (num_threads > 0) + { + boost::sort::block_indirect_sort(dest_to_index.begin(), + dest_to_index.end(), num_threads); + } + else + std::ranges::sort(dest_to_index); } // Build list of neighbour dest ranks and count number of indices to @@ -135,10 +145,13 @@ determine_sharing_ranks(MPI_Comm comm, std::span indices) // Build {global index, pos, src} list std::vector> indices_list; - for (std::size_t p = 0; p < recv_disp0.size() - 1; ++p) - for (std::int32_t i = recv_disp0[p]; i < recv_disp0[p + 1]; ++i) - indices_list.push_back({recv_buffer0[i], i, int(p)}); - std::ranges::sort(indices_list); + { + common::Timer timer("Topology: build and sort received index list"); + for (std::size_t p = 0; p < recv_disp0.size() - 1; ++p) + for (std::int32_t i = recv_disp0[p]; i < recv_disp0[p + 1]; ++i) + indices_list.push_back({recv_buffer0[i], i, int(p)}); + std::ranges::sort(indices_list); + } // Find which ranks have each index std::vector num_items_per_dest1(recv_disp0.size() - 1, 0); @@ -286,7 +299,7 @@ determine_sharing_ranks(MPI_Comm comm, std::span indices) std::array, 2> vertex_ownership_groups( const std::vector>& cells_owned, const std::vector>& cells_ghost, - std::span boundary_vertices) + std::span boundary_vertices, int 
num_threads) { common::Timer timer("Topology: determine vertex ownership groups (owned, " "undetermined, unowned)"); @@ -300,10 +313,17 @@ std::array, 2> vertex_ownership_groups( local_vertex_set.insert(local_vertex_set.end(), c.begin(), c.end()); { - dolfinx::radix_sort(local_vertex_set); + if (num_threads > 0) + { + boost::sort::block_indirect_sort(local_vertex_set.begin(), + local_vertex_set.end(), num_threads); + } + else + dolfinx::radix_sort(local_vertex_set); auto [unique_end, range_end] = std::ranges::unique(local_vertex_set); local_vertex_set.erase(unique_end, range_end); } + // Build set of ghost cell vertices (attached to a ghost cell) std::vector ghost_vertex_set; ghost_vertex_set.reserve( @@ -313,10 +333,19 @@ std::array, 2> vertex_ownership_groups( ghost_vertex_set.insert(ghost_vertex_set.end(), c.begin(), c.end()); { - dolfinx::radix_sort(ghost_vertex_set); + if (num_threads > 0) + { + + boost::sort::block_indirect_sort(ghost_vertex_set.begin(), + ghost_vertex_set.end(), num_threads); + } + else + dolfinx::radix_sort(ghost_vertex_set); + auto [unique_end, range_end] = std::ranges::unique(ghost_vertex_set); ghost_vertex_set.erase(unique_end, range_end); } + // Build difference 1: Vertices attached only to owned cells, and // therefore owned by this rank std::vector owned_vertices; @@ -329,7 +358,7 @@ std::array, 2> vertex_ownership_groups( std::ranges::set_difference(ghost_vertex_set, local_vertex_set, std::back_inserter(unowned_vertices)); - // TODO Check this in debug mode only? +#ifndef NDEBUG // Sanity check // No vertices in unowned should also be in boundary... 
std::vector unowned_vertices_in_error; @@ -341,6 +370,7 @@ std::array, 2> vertex_ownership_groups( throw std::runtime_error( "Adding boundary vertices in ghost cells not allowed."); } +#endif return {std::move(owned_vertices), std::move(unowned_vertices)}; } @@ -711,12 +741,12 @@ std::vector convert_to_local_indexing( std::vector data(g.size()); if (num_threads > 0) { - std::vector threads(num_threads); + std::vector threads; for (int i = 0; i < num_threads; ++i) { auto [c0, c1] = dolfinx::MPI::local_range(i, g.size(), num_threads); - threads[i] = std::jthread(transform, std::span(data.data() + c0, c1 - c0), - g.subspan(c0, c1 - c0), global_to_local); + threads.emplace_back(transform, std::span(data.data() + c0, c1 - c0), + g.subspan(c0, c1 - c0), global_to_local); } } else @@ -1091,17 +1121,17 @@ Topology mesh::create_topology( // Create sets of owned and unowned vertices from the cell ownership // and the list of boundary vertices - auto [owned_vertices, unowned_vertices] - = vertex_ownership_groups(owned_cells, ghost_cells, boundary_vertices); + auto [owned_vertices, unowned_vertices] = vertex_ownership_groups( + owned_cells, ghost_cells, boundary_vertices, num_threads); + + timer1.stop(); + timer1.flush(); // For each vertex whose ownership needs determining, find the sharing // ranks. The first index in the list of ranks for a vertex is the // owner (as determined by determine_sharing_ranks). 
const graph::AdjacencyList global_vertex_to_ranks - = determine_sharing_ranks(comm, boundary_vertices); - - timer1.stop(); - timer1.flush(); + = determine_sharing_ranks(comm, boundary_vertices, num_threads); // Iterate over vertices that have 'unknown' ownership, and if flagged // as owned by determine_sharing_ranks update ownership status @@ -1136,7 +1166,7 @@ Topology mesh::create_topology( // Number all owned vertices, iterating over vertices cell-wise std::vector local_vertex_indices(owned_vertices.size(), -1); { - common::Timer timer3("Topology: 3 "); + common::Timer timer3("Topology: 3"); std::int32_t v = 0; for (std::span cells_t : cells) { @@ -1153,6 +1183,45 @@ Topology mesh::create_topology( } } + // // NEW: Number all owned vertices + // // TODO: check if this re-ordering affects geometry retrieval + // // performance + // std::vector local_vertex_indices(owned_vertices.size(), -1); + // { + // common::Timer timer3("Topology: 3"); + + // common::Timer timer("Topology: number owned vertices"); + // std::int32_t v = 0; + // for (auto _cells : cells) + // { + // std::vector vertex_data; + // for (auto vtx : _cells) + // vertex_data.push_back(vtx); + // if (num_threads == 0) + // dolfinx::radix_sort(vertex_data); + // else + // { + // boost::sort::block_indirect_sort(vertex_data.begin(), + // vertex_data.end(), + // num_threads); + // } + + // auto ret = std::ranges::unique(vertex_data); + // auto it0 = owned_vertices.cbegin(); + // for (auto it = vertex_data.begin(); it != ret.begin(); ++it) + // { + // if (*it == *it0) + // { + // std::size_t d = std::distance(owned_vertices.cbegin(), it0); + // assert(d < owned_vertices.size()); + // assert(local_vertex_indices[d] == -1); + // local_vertex_indices[d] = v++; + // ++it0; + // } + // } + // } + // } + common::Timer timer4("Topology: 4"); // Compute the global offset for owned (local) vertex indices diff --git a/cpp/dolfinx/mesh/generation.h b/cpp/dolfinx/mesh/generation.h index 11437a882a5..b7890151337 
100644 --- a/cpp/dolfinx/mesh/generation.h +++ b/cpp/dolfinx/mesh/generation.h @@ -69,7 +69,7 @@ Mesh build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, std::array n, const CellPartitionFunction& partitioner, - const CellReorderFunction& reorder_fn); + const CellReorderFunction& reorder_fn, int num_threads); template Mesh build_prism(MPI_Comm comm, MPI_Comm subcomm, @@ -117,14 +117,14 @@ Mesh create_box(MPI_Comm comm, MPI_Comm subcomm, } if (!partitioner and dolfinx::MPI::size(comm) > 1) - partitioner = create_cell_partitioner(mesh::GhostMode::none, 2); + partitioner = create_cell_partitioner(mesh::GhostMode::none, 2, 0); switch (celltype) { case CellType::tetrahedron: return impl::build_tet(comm, subcomm, p, n, partitioner, reorder_fn); case CellType::hexahedron: - return impl::build_hex(comm, subcomm, p, n, partitioner, reorder_fn); + return impl::build_hex(comm, subcomm, p, n, partitioner, reorder_fn, 0); case CellType::prism: return impl::build_prism(comm, subcomm, p, n, partitioner, reorder_fn); default: @@ -193,7 +193,7 @@ Mesh create_rectangle(MPI_Comm comm, std::array, 2> p, } if (!partitioner and dolfinx::MPI::size(comm) > 1) - partitioner = create_cell_partitioner(mesh::GhostMode::none, 2); + partitioner = create_cell_partitioner(mesh::GhostMode::none, 2, 0); switch (celltype) { @@ -262,19 +262,19 @@ Mesh create_interval(MPI_Comm comm, std::int64_t n, std::array p, } if (!partitioner and dolfinx::MPI::size(comm) > 1) - partitioner = create_cell_partitioner(ghost_mode, 2); + partitioner = create_cell_partitioner(ghost_mode, 2, 0); fem::CoordinateElement element(CellType::interval, 1); if (dolfinx::MPI::rank(comm) == 0) { auto [x, cells] = impl::create_interval_cells(p, n); return create_mesh(comm, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, - {x.size(), 1}, partitioner, 2, reorder_fn); + {x.size(), 1}, partitioner, 2, 0, reorder_fn); } else { return create_mesh(comm, MPI_COMM_NULL, {}, element, MPI_COMM_NULL, - std::vector{}, {0, 1}, 
partitioner, 2, reorder_fn); + std::vector{}, {0, 1}, partitioner, 2, 0, reorder_fn); } } @@ -309,6 +309,8 @@ template std::vector create_geom(MPI_Comm comm, std::array, 2> p, std::array n) { + common::Timer timer("Build built-in mesh geometry"); + // Extract data auto [p0, p1] = p; const auto [nx, ny, nz] = n; @@ -367,6 +369,7 @@ Mesh build_tet(MPI_Comm comm, MPI_Comm subcomm, { x = create_geom(subcomm, p, n); + common::Timer timer("Build BoxMesh (tetrahedra): topology"); const auto [nx, ny, nz] = n; const std::int64_t n_cells = nx * ny * nz; @@ -398,7 +401,7 @@ Mesh build_tet(MPI_Comm comm, MPI_Comm subcomm, } return create_mesh(comm, subcomm, cells, element, subcomm, x, - {x.size() / 3, 3}, partitioner, 2, reorder_fn); + {x.size() / 3, 3}, partitioner, 2, 0, reorder_fn); } template @@ -407,7 +410,8 @@ build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, std::array n, const CellPartitionFunction& partitioner, const std::function( - const graph::AdjacencyList&)>& reorder_fn) + const graph::AdjacencyList&)>& reorder_fn, + int num_threads) { common::Timer timer("Build BoxMesh (hexahedra)"); std::vector x; @@ -417,6 +421,8 @@ build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, { x = create_geom(subcomm, p, n); + common::Timer timer("Build BoxMesh (hexahedra): topology"); + // Create cuboids const auto [nx, ny, nz] = n; const std::int64_t n_cells = nx * ny * nz; @@ -443,7 +449,8 @@ build_hex(MPI_Comm comm, MPI_Comm subcomm, std::array, 2> p, } return create_mesh(comm, subcomm, cells, element, subcomm, x, - {x.size() / 3, 3}, partitioner, 2, reorder_fn); + {x.size() / 3, 3}, partitioner, 2, num_threads, + reorder_fn); } template @@ -491,7 +498,7 @@ Mesh build_prism(MPI_Comm comm, MPI_Comm subcomm, } return create_mesh(comm, subcomm, cells, element, subcomm, x, - {x.size() / 3, 3}, partitioner, 2, reorder_fn); + {x.size() / 3, 3}, partitioner, 2, 0, reorder_fn); } template @@ -641,12 +648,12 @@ Mesh build_tri(MPI_Comm comm, std::array, 2> p, } return 
create_mesh(comm, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, - {x.size() / 2, 2}, partitioner, 2, reorder_fn); + {x.size() / 2, 2}, partitioner, 2, 0, reorder_fn); } else { return create_mesh(comm, MPI_COMM_NULL, {}, element, MPI_COMM_NULL, - std::vector{}, {0, 2}, partitioner, 2, reorder_fn); + std::vector{}, {0, 2}, partitioner, 2, 0, reorder_fn); } } @@ -690,12 +697,12 @@ Mesh build_quad(MPI_Comm comm, std::array, 2> p, } return create_mesh(comm, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, - {x.size() / 2, 2}, partitioner, 2, reorder_fn); + {x.size() / 2, 2}, partitioner, 2, 0, reorder_fn); } else { return create_mesh(comm, MPI_COMM_NULL, {}, element, MPI_COMM_NULL, - std::vector{}, {0, 2}, partitioner, 2, reorder_fn); + std::vector{}, {0, 2}, partitioner, 2, 0, reorder_fn); } } } // namespace impl diff --git a/cpp/dolfinx/mesh/graphbuild.cpp b/cpp/dolfinx/mesh/graphbuild.cpp index e0f3e98ccaa..5871677fe1d 100644 --- a/cpp/dolfinx/mesh/graphbuild.cpp +++ b/cpp/dolfinx/mesh/graphbuild.cpp @@ -7,6 +7,7 @@ #include "graphbuild.h" #include "cell_types.h" #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include @@ -183,8 +185,8 @@ graph::AdjacencyList compute_nonlocal_dual_graph( } std::ranges::sort(dest_to_index); - // Build list of dest ranks and count number of items (facets+cell) to - // send to each dest post office (by neighbourhood rank) + // Build list of dest ranks and count number of items (facets+cell) + // to send to each dest post office (by neighbourhood rank) for (auto it = dest_to_index.begin(); it != dest_to_index.end();) { const int neigh_rank = dest.size(); @@ -222,8 +224,7 @@ graph::AdjacencyList compute_nonlocal_dual_graph( static_cast(dest.size()) / comm_size, static_cast(src.size()) / comm_size); - // Create neighbourhood communicator for sending data to - // post offices + // Create neighbourhood communicator for sending data to post offices MPI_Comm comm_po_post; 
MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), dest.data(), MPI_UNWEIGHTED, @@ -286,8 +287,8 @@ graph::AdjacencyList compute_nonlocal_dual_graph( // Search for consecutive facets (-> dual graph edge between cells) // and pack into send buffer. We store for every cell the number of - // matches, the offsets of each cell and the continuous data. - // Note: 'deges' is short for dual edges. + // matches, the offsets of each cell and the continuous data. Note: + // 'deges' is short for dual edges. std::vector dedge_send_count(recv_disp.back()); std::vector dedge_send_displs(dedge_send_count.size() + 1, 0); std::vector dedge_send_data; @@ -498,8 +499,9 @@ graph::AdjacencyList compute_nonlocal_dual_graph( offset += dedge_recv_count[i]; } - // local connections are possibly introduced again by remote -> remove - // duplicates + + // Local connections are possibly introduced again by remote -> + // remove duplicates std::size_t duplicates_count = 0; for (std::size_t node = 0; node < offsets.size() - 1; node++) { @@ -530,10 +532,11 @@ std::tuple, std::vector, int, mesh::build_local_dual_graph( std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { - spdlog::info("Build local part of mesh dual graph (mixed)"); - common::Timer timer("Compute local part of mesh dual graph (mixed)"); + num_threads = 0; // FIXME: temporary serial override; remove to honour the num_threads parameter + spdlog::info("Build local part of mesh dual graph"); + common::Timer timer("Compute local part of mesh dual graph"); if (std::size_t ncells_local = std::accumulate(cells.begin(), cells.end(), 0, @@ -558,6 +561,8 @@ mesh::build_local_dual_graph( // number of vertices per facet -> size computations for later on // used data structures + common::Timer timer0("Compute local part of mesh dual graph: 0"); + // TODO: cell_offsets can be removed? 
std::vector cell_offsets{0}; cell_offsets.reserve(cells.size() + 1); @@ -585,6 +590,9 @@ mesh::build_local_dual_graph( { max = std::max(max, cell_facets.num_links(node)); }); } + timer0.stop(); + timer0.flush(); + // 2) Build a list of (all) facets, defined by sorted vertices, with // the connected cell index after the vertices. For v_ij the j-th // vertex of the i-th facet. The last index is the cell index (non @@ -594,47 +602,133 @@ mesh::build_local_dual_graph( // ⋮ ⋮ ⋮ ⋮ ⋱ ⋮ ⋮ // v_n1, v_n2, -1, -1, ..., -1, n] - const int shape1 = max_vertices_per_facet + 1; - std::vector facets; - facets.reserve(facet_count * shape1); - constexpr std::int32_t padding_value = -1; + common::Timer timer1("Compute local part of mesh dual graph: 1"); - for (std::size_t j = 0; j < cells.size(); ++j) + auto build_facets_fn + = [](int shape1, int num_cell_vertices, std::size_t cell_offset, + const graph::AdjacencyList& cell_facets, + std::span cells, std::span facets) { - const CellType& cell_type = celltypes[j]; - std::span _cells = cells[j]; - - int num_cell_vertices = mesh::cell_num_entities(cell_type, 0); - std::int32_t num_cells = _cells.size() / num_cell_vertices; - graph::AdjacencyList cell_facets - = mesh::get_entity_vertices(cell_type, tdim - 1); - - for (std::int32_t c = 0; c < num_cells; ++c) + constexpr std::int32_t padding_value = -1; + for (std::size_t c = 0; c < cells.size() / num_cell_vertices; ++c) { // Loop over cell facets - std::span v = _cells.subspan(num_cell_vertices * c, num_cell_vertices); + auto v = cells.subspan(num_cell_vertices * c, num_cell_vertices); for (int f = 0; f < cell_facets.num_nodes(); ++f) { std::span facet_vertices = cell_facets.links(f); - std::ranges::transform(facet_vertices, std::back_inserter(facets), + auto facet_c = facets.subspan( + (c * cell_facets.num_nodes() + f) * shape1, shape1); + std::ranges::transform(facet_vertices, facet_c.begin(), [v](auto idx) { return v[idx]; }); - std::sort(std::prev(facets.end(), facet_vertices.size()), 
facets.end()); - facets.insert(facets.end(), - max_vertices_per_facet - facet_vertices.size(), - padding_value); - facets.push_back(c + cell_offsets[j]); + + auto it = std::next(facet_c.begin(), facet_vertices.size()); + std::sort(facet_c.begin(), it); + std::fill(it, facet_c.end(), padding_value); + facet_c.back() = c + cell_offset; } } + }; + + const int shape1 = max_vertices_per_facet + 1; + std::vector facets(facet_count * shape1); + for (std::size_t j = 0; j < cells.size(); ++j) + { + CellType cell_type = celltypes[j]; + int num_cell_vertices = mesh::cell_num_entities(cell_type, 0); + graph::AdjacencyList cell_facets + = mesh::get_entity_vertices(cell_type, tdim - 1); + std::span _cells = cells[j]; + if (num_threads > 0) + { + std::vector threads; + for (int i = 0; i < num_threads; ++i) + { + auto [c0, c1] = dolfinx::MPI::local_range( + i, _cells.size() / num_cell_vertices, num_threads); + threads.emplace_back( + build_facets_fn, shape1, num_cell_vertices, cell_offsets[j] + c0, + std::cref(cell_facets), + _cells.subspan(c0 * num_cell_vertices, + (c1 - c0) * num_cell_vertices), + std::span(facets.data() + (c0 * cell_facets.num_nodes()) * shape1, + ((c1 - c0) * cell_facets.num_nodes()) * shape1)); + } + } + else + { + build_facets_fn(shape1, num_cell_vertices, cell_offsets[j], + std::cref(cell_facets), _cells, facets); + } } + timer1.stop(); + timer1.flush(); + + // const int shape1 = max_vertices_per_facet + 1; + // std::vector facets; + // facets.reserve(facet_count * shape1); + // constexpr std::int32_t padding_value = -1; + + // for (std::size_t j = 0; j < cells.size(); ++j) + // { + // const CellType& cell_type = celltypes[j]; + // std::span _cells = cells[j]; + + // int num_cell_vertices = mesh::cell_num_entities(cell_type, 0); + // std::int32_t num_cells = _cells.size() / num_cell_vertices; + // graph::AdjacencyList cell_facets + // = mesh::get_entity_vertices(cell_type, tdim - 1); + + // for (std::int32_t c = 0; c < num_cells; ++c) + // { + // // Loop 
over cell facets + // std::span v = _cells.subspan(num_cell_vertices * c, num_cell_vertices); + // for (int f = 0; f < cell_facets.num_nodes(); ++f) + // { + // std::span facet_vertices = cell_facets.links(f); + // std::ranges::transform(facet_vertices, std::back_inserter(facets), + // [v](auto idx) { return v[idx]; }); + // std::sort(std::prev(facets.end(), facet_vertices.size()), + // facets.end()); facets.insert(facets.end(), + // max_vertices_per_facet - facet_vertices.size(), + // padding_value); + // facets.push_back(c + cell_offsets[j]); + // } + // } + // } + // 3) Sort facets by vertex key - std::vector perm - = dolfinx::sort_by_perm(std::span(facets), shape1); + common::Timer timer3("Compute local part of mesh dual graph: 3"); + + std::vector perm; + if (num_threads > 0) + { + perm.resize(facets.size() / shape1, 0); + std::iota(perm.begin(), perm.end(), 0); + boost::sort::block_indirect_sort( + perm.begin(), perm.end(), + [facets = std::cref(facets), shape1](auto f0, auto f1) + { + auto it0 = std::next(facets.get().begin(), f0 * shape1); + auto it1 = std::next(facets.get().begin(), f1 * shape1); + return std::lexicographical_compare(it0, std::next(it0, shape1), it1, + std::next(it1, shape1)); + }, + num_threads); + } + else + perm = dolfinx::sort_by_perm(std::span(facets), shape1); + + timer3.stop(); + timer3.flush(); // 4) Iterate over sorted list of facets. Facets shared by more than // one cell lead to a graph edge to be added. Facets that are not // shared are stored as these might be shared by a cell on another // process. 
+ common::Timer timer4("Compute local part of mesh dual graph: 4"); + std::vector unmatched_facets; std::vector local_cells; std::vector> edges; @@ -659,8 +753,7 @@ mesh::build_local_dual_graph( std::int32_t cell_count = matching_facets.size(); assert(cell_count >= 1); - if (!max_facet_to_cell_links.has_value() - or (cell_count < *max_facet_to_cell_links)) + if (!max_facet_to_cell_links or cell_count < *max_facet_to_cell_links) { // Store unmatched facets and the attached cell for (std::int32_t i = 0; i < cell_count; i++) @@ -676,12 +769,12 @@ mesh::build_local_dual_graph( // added later). In the range [it, it_next_facet), all // combinations are added. for (auto facet_a_it = it; facet_a_it != matching_facets.end(); - facet_a_it++) + ++facet_a_it) { std::span facet_a(facets.data() + *facet_a_it * shape1, shape1); std::int32_t cell_a = facet_a.back(); for (auto facet_b_it = std::next(facet_a_it); - facet_b_it != matching_facets.end(); facet_b_it++) + facet_b_it != matching_facets.end(); ++facet_b_it) { std::span facet_b(facets.data() + *facet_b_it * shape1, shape1); std::int32_t cell_b = facet_b.back(); @@ -694,13 +787,17 @@ mesh::build_local_dual_graph( } } + timer4.stop(); + timer4.flush(); + // 5) Build adjacency list data. Prepare data structure and assemble // into. Important: we have only computed one direction of the dual // edges, we add both forward and backward to the final data // structure. 
- std::vector num_links(cell_offsets.back(), 0); + common::Timer timer5("Compute local part of mesh dual graph: 5"); + std::vector num_links(cell_offsets.back(), 0); for (auto [a, b] : edges) { ++num_links[a]; @@ -718,6 +815,9 @@ mesh::build_local_dual_graph( data[pos[e[1]]++] = e[0]; }); + timer5.stop(); + timer5.flush(); + return {graph::AdjacencyList(std::move(data), std::move(offsets)), std::move(unmatched_facets), max_vertices_per_facet, std::move(local_cells)}; @@ -726,14 +826,15 @@ mesh::build_local_dual_graph( graph::AdjacencyList mesh::build_dual_graph(MPI_Comm comm, std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, + int num_threads) { spdlog::info("Building mesh dual graph"); // Compute local part of dual graph (cells are graph nodes, and edges // are connections by facet) - auto [local_graph, facets, shape1, fcells] - = mesh::build_local_dual_graph(celltypes, cells, max_facet_to_cell_links); + auto [local_graph, facets, shape1, fcells] = mesh::build_local_dual_graph( + celltypes, cells, max_facet_to_cell_links, num_threads); // Extend with nonlocal edges and convert to global indices graph::AdjacencyList graph diff --git a/cpp/dolfinx/mesh/graphbuild.h b/cpp/dolfinx/mesh/graphbuild.h index 7137ca99d67..863d164e49c 100644 --- a/cpp/dolfinx/mesh/graphbuild.h +++ b/cpp/dolfinx/mesh/graphbuild.h @@ -33,6 +33,8 @@ enum class CellType : std::int8_t; /// Passing std::nullopt (no upper bound) corresponds. /// to `max_facet_to_cell_links`=∞, i.e. every facet is considered /// unmatched. +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// /// @return /// 1. 
Local dual graph @@ -60,7 +62,8 @@ std::tuple, std::vector, int, std::vector> build_local_dual_graph(std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links); + std::optional max_facet_to_cell_links, + int num_threads); /// @brief Build distributed mesh dual graph (cell-cell connections via /// facets) from minimal mesh data. @@ -83,6 +86,8 @@ build_local_dual_graph(std::span celltypes, /// manifold meshes. Passing std::nullopt (no upper bound) corresponds /// to `max_facet_to_cell_links`=∞, i.e. every facet is considered /// unmatched. +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// /// @return The dual graph. /// @@ -96,6 +101,7 @@ build_local_dual_graph(std::span celltypes, graph::AdjacencyList build_dual_graph(MPI_Comm comm, std::span celltypes, const std::vector>& cells, - std::optional max_facet_to_cell_links); + std::optional max_facet_to_cell_links, + int num_threads); } // namespace dolfinx::mesh diff --git a/cpp/dolfinx/mesh/topologycomputation.cpp b/cpp/dolfinx/mesh/topologycomputation.cpp index dd4b16c3971..d12d9d03837 100644 --- a/cpp/dolfinx/mesh/topologycomputation.cpp +++ b/cpp/dolfinx/mesh/topologycomputation.cpp @@ -656,7 +656,7 @@ compute_entities_by_key_matching( auto sort_threaded = [](const auto& entity_list_sorted, int num_vertices_per_entity, int num_threads) { - std::vector sort_order( + std::vector sort_order( entity_list_sorted.size() / num_vertices_per_entity, 0); std::iota(sort_order.begin(), sort_order.end(), 0); boost::sort::sample_sort( @@ -675,7 +675,7 @@ compute_entities_by_key_matching( }; // Sort the list and label uniquely - const std::vector sort_order + const std::vector sort_order = num_threads == 0 ? 
dolfinx::sort_by_perm(entity_list_sorted, num_vertices_per_entity) diff --git a/cpp/dolfinx/mesh/utils.cpp b/cpp/dolfinx/mesh/utils.cpp index 9a3e3f1db7c..d6dc8521bf5 100644 --- a/cpp/dolfinx/mesh/utils.cpp +++ b/cpp/dolfinx/mesh/utils.cpp @@ -99,9 +99,9 @@ std::vector mesh::exterior_facet_indices(const Topology& topology) //------------------------------------------------------------------------------ mesh::CellPartitionFunction mesh::create_cell_partitioner( mesh::GhostMode ghost_mode, const graph::partition_fn& partfn, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { - return [partfn, ghost_mode, max_facet_to_cell_links]( + return [partfn, ghost_mode, max_facet_to_cell_links, num_threads]( MPI_Comm comm, int nparts, const std::vector& cell_types, const std::vector>& cells) -> graph::AdjacencyList @@ -109,8 +109,8 @@ mesh::CellPartitionFunction mesh::create_cell_partitioner( spdlog::info("Compute partition of cells across ranks"); // Compute distributed dual graph (for the cells on this process) - graph::AdjacencyList dual_graph - = build_dual_graph(comm, cell_types, cells, max_facet_to_cell_links); + graph::AdjacencyList dual_graph = build_dual_graph( + comm, cell_types, cells, max_facet_to_cell_links, num_threads); // Just flag any kind of ghosting for now bool ghosting = (ghost_mode != GhostMode::none); @@ -122,10 +122,10 @@ mesh::CellPartitionFunction mesh::create_cell_partitioner( //----------------------------------------------------------------------------- mesh::CellPartitionFunction mesh::create_cell_partitioner( mesh::GhostMode ghost_mode, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { return create_cell_partitioner(ghost_mode, &graph::partition_graph, - max_facet_to_cell_links); + max_facet_to_cell_links, num_threads); } //----------------------------------------------------------------------------- std::vector diff --git a/cpp/dolfinx/mesh/utils.h 
b/cpp/dolfinx/mesh/utils.h index b67c40aa756..d04814a9a83 100644 --- a/cpp/dolfinx/mesh/utils.h +++ b/cpp/dolfinx/mesh/utils.h @@ -276,7 +276,7 @@ create_boundary_vertices_fn(const CellReorderFunction& reorder_fn, auto [graph, unmatched_facets, max_v, _facet_attached_cells] = build_local_dual_graph(std::vector{celltypes[i]}, std::vector{cells1_v_local.back()}, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); // Store unmatched_facets for current cell type facets.emplace_back(std::move(unmatched_facets), max_v); @@ -342,7 +342,7 @@ create_boundary_vertices_fn(const CellReorderFunction& reorder_fn, } // Compute row permutation - const std::vector perm = dolfinx::sort_by_perm( + const std::vector perm = dolfinx::sort_by_perm( std::span(facets0), max_v); // For facets in facets0 that appear only once, store the facet @@ -965,11 +965,12 @@ entities_to_geometry(const Mesh& mesh, int dim, /// @param[in] max_facet_to_cell_links Bound on the number of cells a /// facet needs to be connected to to be considered *matched* (not on /// boundary for non-branching meshes). +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// @return Function that computes the destination ranks for each cell. -CellPartitionFunction -create_cell_partitioner(mesh::GhostMode ghost_mode, - const graph::partition_fn& partfn, - std::optional max_facet_to_cell_links); +CellPartitionFunction create_cell_partitioner( + mesh::GhostMode ghost_mode, const graph::partition_fn& partfn, + std::optional max_facet_to_cell_links, int num_threads = 0); /// @brief Create a function that computes destination rank for mesh /// cells on this rank by applying the default graph partitioner to the @@ -979,11 +980,13 @@ create_cell_partitioner(mesh::GhostMode ghost_mode, /// @param[in] max_facet_to_cell_links Bound on the number of cells a /// facet needs to be connected to to be considered *matched* (not on /// boundary for non-branching meshes). 
+/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. /// @return Function that computes the destination ranks for each cell. CellPartitionFunction create_cell_partitioner(mesh::GhostMode ghost_mode, - std::optional max_facet_to_cell_links - = 2); + std::optional max_facet_to_cell_links, + int num_threads = 0); /// @brief Compute incident entities. /// @param[in] topology The topology. @@ -1037,8 +1040,10 @@ compute_incident_entities(const Topology& topology, /// redistributed. /// @param[in] max_facet_to_cell_links Bound on the number of cells a /// facet can be connected to. -/// @param[in] reorder_fn Function that reorders (locally) cells that -/// are owned by this process. +/// @param[in] num_threads Number of threads to use. Use 0 to not launch +/// threads. +/// @param[in] reorder_fn Function that reorders (locally) cells that are +/// owned by this process. /// @return A mesh distributed on the communicator `comm`. template Mesh> create_mesh( @@ -1048,7 +1053,7 @@ Mesh> create_mesh( typename std::remove_reference_t>>& elements, MPI_Comm commg, const U& x, std::array xshape, const CellPartitionFunction& partitioner, - std::optional max_facet_to_cell_links, + std::optional max_facet_to_cell_links, int num_threads, const CellReorderFunction& reorder_fn = graph::reorder_gps) { assert(cells.size() == elements.size()); @@ -1172,7 +1177,7 @@ Mesh> create_mesh( [](auto& c) { return std::span(c); }); Topology topology = create_topology(comm, celltypes, cells1_v_span, original_idx1_span, - ghost_owners_span, boundary_v, 0); + ghost_owners_span, boundary_v, num_threads); // Create connectivities required higher-order geometries for creating // a Geometry object @@ -1262,12 +1267,12 @@ Mesh> create_mesh( typename std::remove_reference_t>& element, MPI_Comm commg, const U& x, std::array xshape, const CellPartitionFunction& partitioner, - std::optional max_facet_to_cell_links, + std::optional max_facet_to_cell_links, int num_threads, 
const CellReorderFunction& reorder_fn = graph::reorder_gps) { return create_mesh(comm, commt, std::vector{cells}, std::vector{element}, commg, x, xshape, partitioner, max_facet_to_cell_links, - reorder_fn); + num_threads, reorder_fn); } /// @brief Create a distributed mesh from mesh data using the default @@ -1301,14 +1306,14 @@ create_mesh(MPI_Comm comm, std::span cells, if (dolfinx::MPI::size(comm) == 1) { return create_mesh(comm, comm, std::vector{cells}, std::vector{elements}, - comm, x, xshape, nullptr, max_facet_to_cell_links); + comm, x, xshape, nullptr, max_facet_to_cell_links, 0); } else { return create_mesh( comm, comm, std::vector{cells}, std::vector{elements}, comm, x, xshape, - create_cell_partitioner(ghost_mode, max_facet_to_cell_links), - max_facet_to_cell_links); + create_cell_partitioner(ghost_mode, max_facet_to_cell_links, 0), + max_facet_to_cell_links, 0); } } diff --git a/cpp/dolfinx/refinement/refine.h b/cpp/dolfinx/refinement/refine.h index 157d5da4c98..f5fc41c7d4e 100644 --- a/cpp/dolfinx/refinement/refine.h +++ b/cpp/dolfinx/refinement/refine.h @@ -74,7 +74,7 @@ create_identity_partitioner(const mesh::Mesh& parent_mesh, if (comm == MPI_COMM_NULL) return graph::regular_adjacency_list(std::move(destinations), 1); - auto dual_graph = mesh::build_dual_graph(comm, cell_types, cells, 2); + auto dual_graph = mesh::build_dual_graph(comm, cell_types, cells, 2, 0); std::vector node_disp(MPI::size(comm) + 1, 0); std::int32_t local_size = dual_graph.num_nodes(); MPI_Allgather(&local_size, 1, dolfinx::MPI::mpi_t, @@ -152,7 +152,7 @@ refine(const mesh::Mesh& mesh, mesh::Mesh mesh1 = mesh::create_mesh( mesh.comm(), mesh.comm(), cell_adj.array(), mesh.geometry().cmap(), mesh.comm(), new_vertex_coords, xshape, - std::get(partitioner), 2); + std::get(partitioner), 2, 0); // Report the number of refined cells const int D = topology->dim(); diff --git a/cpp/dolfinx/refinement/uniform.cpp b/cpp/dolfinx/refinement/uniform.cpp index 126a12cecc1..7f7a1e3d2dd 100644 
--- a/cpp/dolfinx/refinement/uniform.cpp +++ b/cpp/dolfinx/refinement/uniform.cpp @@ -297,7 +297,7 @@ refinement::uniform_refine(const mesh::Mesh& mesh, mixed_topology.end()); mesh::Mesh new_mesh = mesh::create_mesh( mesh.comm(), mesh.comm(), topo_span, mesh.geometry().cmaps(), mesh.comm(), - new_x, {new_x.size() / 3, 3}, partitioner, 2); + new_x, {new_x.size() / 3, 3}, partitioner, 2, 0); return new_mesh; } diff --git a/cpp/dolfinx/refinement/uniform.h b/cpp/dolfinx/refinement/uniform.h index d90cc96bcf9..33756ab0022 100644 --- a/cpp/dolfinx/refinement/uniform.h +++ b/cpp/dolfinx/refinement/uniform.h @@ -19,6 +19,6 @@ template mesh::Mesh uniform_refine(const mesh::Mesh& mesh, const mesh::CellPartitionFunction& partitioner - = mesh::create_cell_partitioner(mesh::GhostMode::none, 2)); + = mesh::create_cell_partitioner(mesh::GhostMode::none, 2, 0)); } // namespace dolfinx::refinement diff --git a/cpp/test/common/sort.cpp b/cpp/test/common/sort.cpp index e6a60d0298e..a1810679f1e 100644 --- a/cpp/test/common/sort.cpp +++ b/cpp/test/common/sort.cpp @@ -106,7 +106,7 @@ TEST_CASE("Test argsort bitset") auto generator = std::bind(distribution, engine); std::generate(arr.begin(), arr.end(), generator); - std::vector perm + std::vector perm = dolfinx::sort_by_perm(arr, shape1); REQUIRE((int)perm.size() == shape0); diff --git a/cpp/test/fem/form.cpp b/cpp/test/fem/form.cpp index 348ae15c5eb..18042329c0f 100644 --- a/cpp/test/fem/form.cpp +++ b/cpp/test/fem/form.cpp @@ -54,7 +54,7 @@ TEST_CASE("Create Expression/Form (mismatch of mesh geometry)", auto mesh = std::make_shared>(mesh::create_box( MPI_COMM_WORLD, {{{0.0, 0.0, 0.0}, {0.96, 4.5, 2.0}}}, {2, 4, 5}, mesh::CellType::hexahedron, - mesh::create_cell_partitioner(mesh::GhostMode::none))); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2, 0))); auto element = basix::create_element( basix::element::family::P, basix::cell::type::hexahedron, 1, basix::element::lagrange_variant::unset, diff --git a/cpp/test/matrix.cpp 
b/cpp/test/matrix.cpp index 4b0ac30fd87..3a77b82a27b 100644 --- a/cpp/test/matrix.cpp +++ b/cpp/test/matrix.cpp @@ -30,7 +30,7 @@ namespace template la::MatrixCSR create_operator(MPI_Comm comm) { - auto part = mesh::create_cell_partitioner(mesh::GhostMode::none, 2); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::none, 2, 0); auto mesh = std::make_shared>( mesh::create_box(comm, {{{0.0, 0.0, 0.0}, {1.0, 1.0, 1.0}}}, {12, 12, 12}, mesh::CellType::tetrahedron, part)); diff --git a/cpp/test/mesh/branching_manifold.cpp b/cpp/test/mesh/branching_manifold.cpp index 6ed5fab9bef..29c7f7d91cc 100644 --- a/cpp/test/mesh/branching_manifold.cpp +++ b/cpp/test/mesh/branching_manifold.cpp @@ -48,7 +48,7 @@ TEST_CASE("dual_graph_branching") { // default auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] - = mesh::build_local_dual_graph(celltypes, {cells}, 2); + = mesh::build_local_dual_graph(celltypes, {cells}, 2, 0); CHECK(dual_graph.num_nodes() == 4); @@ -83,7 +83,7 @@ TEST_CASE("dual_graph_branching") // max_facet_to_cell_links = 3 // Note: additionally facet (2) is now considered unmatched auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] - = mesh::build_local_dual_graph(celltypes, {cells}, 3); + = mesh::build_local_dual_graph(celltypes, {cells}, 3, 0); CHECK(dual_graph.num_nodes() == 4); @@ -125,7 +125,7 @@ TEST_CASE("dual_graph_branching") auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] = mesh::build_local_dual_graph(celltypes, {cells}, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); CHECK(dual_graph.num_nodes() == 4); @@ -184,7 +184,7 @@ TEST_CASE("dual_graph_self_dual") { auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] = mesh::build_local_dual_graph(celltypes, {cells}, - max_facet_to_cell_links); + max_facet_to_cell_links, 0); CHECK(max_vertices_per_facet == 1); CHECK(dual_graph.num_nodes() == 3); @@ -254,7 +254,7 @@ TEST_CASE("dual_graph_branching_parallel") // 
Check local dual graphs. auto [dual_graph, unmatched_facets, max_vertices_per_facet, cell_data] - = mesh::build_local_dual_graph(celltypes, {cells}, 3); + = mesh::build_local_dual_graph(celltypes, {cells}, 3, 0); CHECK(max_vertices_per_facet == 1); @@ -283,7 +283,7 @@ TEST_CASE("dual_graph_branching_parallel") } auto dual_graph = mesh::build_dual_graph( - comm, celltypes, std::vector>{cells}, 3); + comm, celltypes, std::vector>{cells}, 3, 0); if (dolfinx::MPI::rank(comm) == 0) { diff --git a/cpp/test/mesh/distributed_mesh.cpp b/cpp/test/mesh/distributed_mesh.cpp index d51b32b929d..6eeb1659228 100644 --- a/cpp/test/mesh/distributed_mesh.cpp +++ b/cpp/test/mesh/distributed_mesh.cpp @@ -27,7 +27,7 @@ constexpr int N = 8; [[maybe_unused]] void create_mesh_file(MPI_Comm comm) { // Create mesh using all processes and save xdmf - auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet); + auto part = mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2); auto mesh = std::make_shared>( mesh::create_rectangle(comm, {{{0.0, 0.0}, {1.0, 1.0}}}, {N, N}, mesh::CellType::triangle, part)); @@ -139,7 +139,7 @@ void test_distributed_mesh(const mesh::CellPartitionFunction& partitioner) // Build mesh mesh::Mesh mesh = mesh::create_mesh(comm, subset_comm, cells, cmap, comm, x, - xshape, partitioner, 2); + xshape, partitioner, 2, 0); auto t = mesh.topology(); int tdim = t->dim(); CHECK(t->index_map(tdim)->size_global() == 2 * N * N); diff --git a/cpp/test/mesh/refinement/interval.cpp b/cpp/test/mesh/refinement/interval.cpp index a3297d49de8..503a3f80243 100644 --- a/cpp/test/mesh/refinement/interval.cpp +++ b/cpp/test/mesh/refinement/interval.cpp @@ -61,7 +61,7 @@ mesh::Mesh create_3_vertex_interval_mesh() return mesh::create_mesh( MPI_COMM_SELF, MPI_COMM_SELF, cells, element, MPI_COMM_SELF, x, {x.size() / 3, 3}, - mesh::create_cell_partitioner(mesh::GhostMode::none, 2), 2); + mesh::create_cell_partitioner(mesh::GhostMode::none, 2), 2, 0); } 
TEMPLATE_TEST_CASE("Interval uniform refinement", @@ -115,7 +115,7 @@ TEMPLATE_TEST_CASE("Interval adaptive refinement", // TODO: parent_facet auto [refined_mesh, parent_edge, parent_facet] = refinement::refine( mesh, std::span(edges), - mesh::create_cell_partitioner(mesh::GhostMode::shared_facet), + mesh::create_cell_partitioner(mesh::GhostMode::shared_facet, 2), refinement::Option::parent_cell); std::vector expected_x = { @@ -185,7 +185,7 @@ TEMPLATE_TEST_CASE("Interval Refinement (parallel)", MPI_Comm commt = rank == 0 ? MPI_COMM_SELF : MPI_COMM_NULL; return mesh::create_mesh(MPI_COMM_WORLD, commt, cells, element, commt, x, - {x.size() / 3, 3}, partitioner, 2); + {x.size() / 3, 3}, partitioner, 2, 0); }; mesh::Mesh mesh = create_mesh(); diff --git a/cpp/test/mesh/refinement/rectangle.cpp b/cpp/test/mesh/refinement/rectangle.cpp index 8c0534fa951..5686ffceb45 100644 --- a/cpp/test/mesh/refinement/rectangle.cpp +++ b/cpp/test/mesh/refinement/rectangle.cpp @@ -98,7 +98,8 @@ plotter.show() mesh.topology()->create_entities(1, std::thread::hardware_concurrency()); auto [mesh_fine, parent_cell, parent_facet] = refinement::refine( - mesh, std::nullopt, mesh::create_cell_partitioner(mesh::GhostMode::none), + mesh, std::nullopt, + mesh::create_cell_partitioner(mesh::GhostMode::none, 2), refinement::Option::parent_cell_and_facet); // vertex layout: diff --git a/python/demo/demo_mixed-topology.py b/python/demo/demo_mixed-topology.py index 01fbf51a60b..28db34e52e3 100644 --- a/python/demo/demo_mixed-topology.py +++ b/python/demo/demo_mixed-topology.py @@ -108,9 +108,9 @@ hexahedron = coordinate_element(CellType.hexahedron, 1) prism = coordinate_element(CellType.prism, 1) -part = create_cell_partitioner(GhostMode.none, 2) # type: ignore +part = create_cell_partitioner(GhostMode.none, 2, 0) # type: ignore mesh = create_mesh( - MPI.COMM_WORLD, cells_np, [hexahedron._cpp_object, prism._cpp_object], geomx, part, 2 + MPI.COMM_WORLD, cells_np, [hexahedron._cpp_object, 
prism._cpp_object], geomx, part, 2, 0 ) # - diff --git a/python/dolfinx/io/utils.py b/python/dolfinx/io/utils.py index 64da295fea3..684e0a89981 100644 --- a/python/dolfinx/io/utils.py +++ b/python/dolfinx/io/utils.py @@ -273,6 +273,7 @@ def read_mesh( x, create_cell_partitioner(ghost_mode, max_facet_to_cell_links), # type: ignore max_facet_to_cell_links, + 0, ) msh.name = name domain = ufl.Mesh(basix_el) diff --git a/python/dolfinx/mesh.py b/python/dolfinx/mesh.py index c326acc04b0..26b44b8d17c 100644 --- a/python/dolfinx/mesh.py +++ b/python/dolfinx/mesh.py @@ -84,7 +84,7 @@ @singledispatch def create_cell_partitioner( - part: Callable, mode: GhostMode, max_facet_to_cell_links: int + part: Callable, mode: GhostMode, max_facet_to_cell_links: int, num_threads: int = 0 ) -> Callable: """Create a function to partition a mesh. @@ -95,15 +95,17 @@ def create_cell_partitioner( facet. Equal to 2 for non-branching manifold meshes. ``None`` corresponds to no upper bound on the number of possible connections. + num_threads: Number of CPU threads to use when creating. If 0, + threads are not spawned. Return: Partitioning function. """ - return _cpp.mesh.create_cell_partitioner(part, mode, max_facet_to_cell_links) + return _cpp.mesh.create_cell_partitioner(part, mode, max_facet_to_cell_links, num_threads) @create_cell_partitioner.register(GhostMode) -def _(mode: GhostMode, max_facet_to_cell_links: int) -> Callable: +def _(mode: GhostMode, max_facet_to_cell_links: int, num_threads: int = 0) -> Callable: """Create a function to partition a mesh. Args: @@ -112,11 +114,13 @@ def _(mode: GhostMode, max_facet_to_cell_links: int) -> Callable: facet. Equal to 2 for non-branching manifold meshes. ``None`` corresponds to no upper bound on the number of possible connections. + num_threads: Number of CPU threads to use when creating. If 0, + threads are not spawned. Return: Partitioning function. 
""" - return _cpp.mesh.create_cell_partitioner(mode, max_facet_to_cell_links) + return _cpp.mesh.create_cell_partitioner(mode, max_facet_to_cell_links, num_threads) class Topology: @@ -763,6 +767,7 @@ def create_mesh( x: npt.NDArray[np.floating], partitioner: Callable | None = None, max_facet_to_cell_links: int = 2, + num_threads: int = 0, ) -> Mesh: """Create a mesh from topology and geometry arrays. @@ -778,6 +783,8 @@ def create_mesh( cells across MPI ranks. max_facet_to_cell_links: Maximum number of cells a facet can be connected to. + num_threads: Number of CPU threads to use when creating the + mesh. If 0, threads are not spawned. Note: If required, the coordinates ``x`` will be cast to the same @@ -830,7 +837,7 @@ def create_mesh( x = np.asarray(x, dtype=dtype, order="C") cells = np.asarray(cells, dtype=np.int64, order="C") msh: _cpp.mesh.Mesh_float32 | _cpp.mesh.Mesh_float64 = _cpp.mesh.create_mesh( - comm, cells, cmap._cpp_object, x, partitioner, max_facet_to_cell_links + comm, cells, cmap._cpp_object, x, partitioner, max_facet_to_cell_links, num_threads ) return Mesh(msh, domain) # type: ignore @@ -1049,7 +1056,7 @@ def create_rectangle( A mesh of a rectangle. """ if partitioner is None and comm.size > 1: - partitioner = _cpp.mesh.create_cell_partitioner(ghost_mode, 2) + partitioner = create_cell_partitioner(ghost_mode, 2) domain = ufl.Mesh( basix.ufl.element( "Lagrange", @@ -1119,6 +1126,7 @@ def create_box( dtype: npt.DTypeLike = default_real_type, ghost_mode=GhostMode.shared_facet, partitioner=None, + num_threads: int = 0, ) -> Mesh: """Create a box mesh. @@ -1133,12 +1141,14 @@ def create_box( ghost_mode: The ghost mode used in the mesh partitioning. partitioner: Function that computes the parallel distribution of cells across MPI ranks. + num_threads: Number of threads to use when creating the mesh. If + 0, no threads are launched.. Returns: A mesh of a box domain. 
""" if partitioner is None and comm.size > 1: - partitioner = _cpp.mesh.create_cell_partitioner(ghost_mode, 2) + partitioner = create_cell_partitioner(ghost_mode, 2, num_threads) domain = ufl.Mesh( basix.ufl.element( "Lagrange", diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h b/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h index a5e805d3b81..c792b23ee38 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/mesh.h @@ -77,7 +77,6 @@ create_cell_partitioner_cpp(const PythonCellPartitionFunction& p); namespace dolfinx_wrappers { - template void declare_meshtags(nb::module_& m, const std::string& type) { @@ -301,7 +300,7 @@ void declare_mesh(nb::module_& m, std::string type) const std::vector>& elements, nb::ndarray x, const part::impl::PythonCellPartitionFunction& p, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::size_t shape1 = x.ndim() == 1 ? 1 : x.shape(1); @@ -330,13 +329,13 @@ void declare_mesh(nb::module_& m, std::string type) return dolfinx::mesh::create_mesh( comm.get(), comm.get(), cells, elements, comm.get(), std::span(x.data(), x.size()), {x.shape(0), shape1}, p_wrap, - max_facet_to_cell_links); + max_facet_to_cell_links, num_threads); } else return dolfinx::mesh::create_mesh( comm.get(), comm.get(), cells, elements, comm.get(), std::span(x.data(), x.size()), {x.shape(0), shape1}, nullptr, - max_facet_to_cell_links); + max_facet_to_cell_links, num_threads); }); m.def( @@ -346,7 +345,7 @@ void declare_mesh(nb::module_& m, std::string type) const dolfinx::fem::CoordinateElement& element, nb::ndarray x, const part::impl::PythonCellPartitionFunction& p, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::size_t shape1 = x.ndim() == 1 ? 
1 : x.shape(1); if (p) @@ -369,19 +368,21 @@ void declare_mesh(nb::module_& m, std::string type) return dolfinx::mesh::create_mesh( comm.get(), comm.get(), std::span(cells.data(), cells.size()), element, comm.get(), std::span(x.data(), x.size()), - {x.shape(0), shape1}, p_wrap, max_facet_to_cell_links); + {x.shape(0), shape1}, p_wrap, max_facet_to_cell_links, + num_threads); } else { return dolfinx::mesh::create_mesh( comm.get(), comm.get(), std::span(cells.data(), cells.size()), element, comm.get(), std::span(x.data(), x.size()), - {x.shape(0), shape1}, nullptr, max_facet_to_cell_links); + {x.shape(0), shape1}, nullptr, max_facet_to_cell_links, + num_threads); } }, nb::arg("comm"), nb::arg("cells"), nb::arg("element"), nb::arg("x").noconvert(), nb::arg("partitioner").none(), - nb::arg("max_facet_to_cell_links").none(), + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), "Helper function for creating meshes."); m.def( "create_submesh", diff --git a/python/dolfinx/wrappers/mesh.cpp b/python/dolfinx/wrappers/mesh.cpp index d86035de9bd..cfb276432f3 100644 --- a/python/dolfinx/wrappers/mesh.cpp +++ b/python/dolfinx/wrappers/mesh.cpp @@ -105,14 +105,16 @@ void mesh(nb::module_& m) "build_dual_graph", [](const MPICommWrapper comm, dolfinx::mesh::CellType cell_type, const dolfinx::graph::AdjacencyList& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::vector c = {cell_type}; return dolfinx::mesh::build_dual_graph( - comm.get(), std::span{c}, {cells.array()}, max_facet_to_cell_links); + comm.get(), std::span{c}, {cells.array()}, max_facet_to_cell_links, + num_threads); }, nb::arg("comm"), nb::arg("cell_type"), nb::arg("cells"), - nb::arg("max_facet_to_cell_links").none(), "Build dual graph for cells"); + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), + "Build dual graph for cells"); m.def( "build_dual_graph", @@ -120,7 +122,7 @@ void mesh(nb::module_& m) std::vector& 
cell_types, const std::vector< nb::ndarray, nb::c_contig>>& cells, - std::optional max_facet_to_cell_links) + std::optional max_facet_to_cell_links, int num_threads) { std::vector> cell_span(cells.size()); for (std::size_t i = 0; i < cells.size(); ++i) @@ -129,10 +131,12 @@ void mesh(nb::module_& m) = std::span(cells[i].data(), cells[i].size()); } return dolfinx::mesh::build_dual_graph( - comm.get(), cell_types, cell_span, max_facet_to_cell_links); + comm.get(), cell_types, cell_span, max_facet_to_cell_links, + num_threads); }, nb::arg("comm"), nb::arg("cell_types"), nb::arg("cells"), - nb::arg("max_facet_to_cell_links").none(), "Build dual graph for cells"); + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), + "Build dual graph for cells"); // dolfinx::mesh::GhostMode enums nb::enum_(m, "GhostMode") @@ -339,16 +343,16 @@ void mesh(nb::module_& m) m.def( "create_cell_partitioner", [](dolfinx::mesh::GhostMode mode, - std::optional max_facet_to_cell_links) - -> part::impl::PythonCellPartitionFunction + std::optional max_facet_to_cell_links, + int num_threads) -> part::impl::PythonCellPartitionFunction { return part::impl::create_cell_partitioner_py( dolfinx::mesh::create_cell_partitioner( - mode, &dolfinx::graph::partition_graph, - max_facet_to_cell_links)); + mode, &dolfinx::graph::partition_graph, max_facet_to_cell_links, + num_threads)); }, nb::arg("mode"), nb::arg("max_facet_to_cell_links").none(), - "Create default cell partitioner."); + nb::arg("num_threads"), "Create a default cell partitioner."); m.def( "create_cell_partitioner", [](const std::function( @@ -356,16 +360,16 @@ void mesh(nb::module_& m) const dolfinx::graph::AdjacencyList& local_graph, bool ghosting)>& part, dolfinx::mesh::GhostMode mode, - std::optional max_facet_to_cell_links) - -> part::impl::PythonCellPartitionFunction + std::optional max_facet_to_cell_links, + int num_threads) -> part::impl::PythonCellPartitionFunction { return part::impl::create_cell_partitioner_py( 
dolfinx::mesh::create_cell_partitioner( mode, part::impl::create_partitioner_cpp(part), - max_facet_to_cell_links)); + max_facet_to_cell_links, num_threads)); }, nb::arg("part"), nb::arg("ghost_mode"), - nb::arg("max_facet_to_cell_links").none(), + nb::arg("max_facet_to_cell_links").none(), nb::arg("num_threads"), "Create a cell partitioner from a graph partitioning function."); m.def( diff --git a/python/test/unit/mesh/test_dual_graph.py b/python/test/unit/mesh/test_dual_graph.py index a4ce550d949..42f10013245 100644 --- a/python/test/unit/mesh/test_dual_graph.py +++ b/python/test/unit/mesh/test_dual_graph.py @@ -26,7 +26,7 @@ def test_dgrsph_1d(): # Circular chain of interval cells cells = [[n0, n0 + 1], [n0 + 1, n0 + 2], [n0 + 2, x]] w = mesh.build_dual_graph( - MPI.COMM_WORLD, mesh.CellType.interval, to_adj(cells, np.int64)._cpp_object, 2 + MPI.COMM_WORLD, mesh.CellType.interval, to_adj(cells, np.int64)._cpp_object, 2, 0 ) assert w.num_nodes == 3 for i in range(w.num_nodes):