Add block size support to SuperLU_DIST matrix (#4252)

jhale · web-flow · commit 8e6725245f8d · 2026-06-30T14:10:09.000Z
* Add support for block size - no tests yet.

* Reduce comments.

* Add vector Laplacian test

* Add test for non-equal block sizes

* Reduce comment
diff --git a/cpp/dolfinx/la/superlu_dist.cpp b/cpp/dolfinx/la/superlu_dist.cpp
@@ -14,10 +14,12 @@ extern "C"
 #include <superlu_zdefs.h>
 }
 #include <algorithm>
+#include <array>
 #include <dolfinx/common/Timer.h>
 #include <dolfinx/la/MatrixCSR.h>
 #include <dolfinx/la/Vector.h>
 #include <initializer_list>
+#include <numeric>
 #include <ranges>
 #include <stdexcept>
 #include <vector>
@@ -49,25 +51,101 @@ namespace
 template <typename...>
 constexpr bool always_false_v = false;
 
+// Expand MatrixCSR block column indices to global scalar column indices.
 std::vector<int_t> col_indices(const auto& A)
 {
-  // Local number of non-zeros
-  std::int32_t m_loc = A.num_owned_rows();
-  std::int64_t nnz_loc = A.row_ptr().at(m_loc);
-
+  std::array<int, 2> bs = A.block_size();
+  std::int32_t m_loc_block = A.num_owned_rows();
+  std::int64_t nnz_loc_block = A.row_ptr().at(m_loc_block);
   std::vector global_indices(A.index_map(1)->global_indices());
-  std::vector<int_t> col_indices(nnz_loc);
-  std::transform(A.cols().begin(), std::next(A.cols().begin(), nnz_loc),
-                 col_indices.begin(), [&global_indices](auto idx) -> int_t
-                 { return global_indices[idx]; });
+
+  if (bs[0] == 1 and bs[1] == 1)
+  {
+    std::vector<int_t> col_indices(nnz_loc_block);
+    std::transform(A.cols().begin(), std::next(A.cols().begin(), nnz_loc_block),
+                   col_indices.begin(), [&global_indices](auto idx) -> int_t
+                   { return global_indices[idx]; });
+    return col_indices;
+  }
+
+  std::vector<int_t> col_indices(nnz_loc_block * bs[0] * bs[1]);
+  const auto& A_cols = A.cols();
+  const auto& A_rowptr = A.row_ptr();
+  std::int64_t pos = 0;
+  for (std::int32_t i = 0; i < m_loc_block; ++i)
+  {
+    for (int i0 = 0; i0 < bs[0]; ++i0)
+    {
+      for (std::int64_t j = A_rowptr[i]; j < A_rowptr[i + 1]; ++j)
+      {
+        int_t col_block = global_indices[A_cols[j]];
+        for (int i1 = 0; i1 < bs[1]; ++i1)
+          col_indices[pos++] = col_block * bs[1] + i1;
+      }
+    }
+  }
   return col_indices;
 }
 //----------------------------------------------------------------------------
+// Expand MatrixCSR block row pointer to scalar row pointer (owned rows only).
 std::vector<int_t> row_indices(const auto& A)
 {
-  return std::vector<int_t>(
-      A.row_ptr().begin(),
-      std::next(A.row_ptr().begin(), A.num_owned_rows() + 1));
+  std::array<int, 2> bs = A.block_size();
+  std::int32_t m_loc_block = A.num_owned_rows();
+  const auto& A_rowptr = A.row_ptr();
+
+  if (bs[0] == 1 and bs[1] == 1)
+  {
+    return std::vector<int_t>(A_rowptr.begin(),
+                              std::next(A_rowptr.begin(), m_loc_block + 1));
+  }
+
+  // Write the per-scalar-row entry counts into `flattened_rowptr[1:]`, with
+  // each block-row contributing `bs[0]` copies.
+  std::vector<int_t> flattened_rowptr(m_loc_block * bs[0] + 1);
+  for (std::int32_t i = 0; i < m_loc_block; ++i)
+  {
+    int_t delta = (A_rowptr[i + 1] - A_rowptr[i]) * bs[1];
+    std::fill_n(std::next(flattened_rowptr.begin(), 1 + i * bs[0]), bs[0],
+                delta);
+  }
+  std::inclusive_scan(std::next(flattened_rowptr.begin()),
+                      flattened_rowptr.end(),
+                      std::next(flattened_rowptr.begin()));
+  return flattened_rowptr;
+}
+//----------------------------------------------------------------------------
+// Expand row-major MatrixCSR blocks to scalar CSR values (owned rows only).
+template <typename T>
+std::vector<T> matrix_values(const MatrixCSR<T>& A)
+{
+  std::array<int, 2> bs = A.block_size();
+  std::int32_t m_loc_block = A.num_owned_rows();
+  std::int64_t nnz_loc_block = A.row_ptr().at(m_loc_block);
+
+  if (bs[0] == 1 and bs[1] == 1)
+  {
+    return std::vector<T>(A.values().begin(),
+                          std::next(A.values().begin(), nnz_loc_block));
+  }
+
+  std::vector<T> flattened_values(nnz_loc_block * bs[0] * bs[1]);
+  const auto& A_values = A.values();
+  const auto& A_rowptr = A.row_ptr();
+  std::int64_t pos = 0;
+  for (std::int32_t i = 0; i < m_loc_block; ++i)
+  {
+    for (int i0 = 0; i0 < bs[0]; ++i0)
+    {
+      for (std::int64_t j = A_rowptr[i]; j < A_rowptr[i + 1]; ++j)
+      {
+        for (int i1 = 0; i1 < bs[1]; ++i1)
+          flattened_values[pos++]
+              = A_values[j * bs[0] * bs[1] + i0 * bs[1] + i1];
+      }
+    }
+  }
+  return flattened_values;
 }
 //----------------------------------------------------------------------------
 template <typename T>
@@ -78,17 +156,18 @@ create_supermatrix(const auto& A, auto& A_mat_values, auto& rowptr, auto& cols)
 
   auto map0 = A.index_map(0);
   auto map1 = A.index_map(1);
+  std::array<int, 2> bs = A.block_size();
 
-  // Global size
-  std::int64_t m = map0->size_global();
-  std::int64_t n = map1->size_global();
+  // Global size (scalar, after block expansion)
+  std::int64_t m = map0->size_global() * bs[0];
+  std::int64_t n = map1->size_global() * bs[1];
   if (m != n)
     throw std::runtime_error("Cannot solve non-square system");
 
-  // Number of local rows, first row and local number of non-zeros
-  std::int32_t m_loc = A.num_owned_rows();
-  std::int64_t first_row = map0->local_range().front();
-  std::int64_t nnz_loc = A.row_ptr().at(m_loc);
+  // Local rows, first row and local non-zeros (scalar, after block expansion)
+  std::int32_t m_loc = A.num_owned_rows() * bs[0];
+  std::int64_t first_row = map0->local_range().front() * bs[0];
+  std::int64_t nnz_loc = A.row_ptr().at(A.num_owned_rows()) * bs[0] * bs[1];
 
   // Check values fit into upper range of int_t.
   auto check = [](std::int64_t x)
@@ -137,7 +216,7 @@ create_supermatrix(const auto& A, auto& A_mat_values, auto& rowptr, auto& cols)
 //----------------------------------------------------------------------------
 template <typename T>
 SuperLUDistMatrix<T>::SuperLUDistMatrix(const MatrixCSR<T>& A)
-    : _comm(A.comm()), _matA_values(A.values()),
+    : _comm(A.comm()), _matA_values(matrix_values(A)),
       _cols(std::make_unique<SuperLUDistStructs::vec_int_t>(col_indices(A))),
       _rowptr(std::make_unique<SuperLUDistStructs::vec_int_t>(row_indices(A))),
       _supermatrix(create_supermatrix<T>(A, _matA_values, *_rowptr, *_cols))
diff --git a/python/test/unit/la/test_superlu_dist.py b/python/test/unit/la/test_superlu_dist.py
@@ -11,6 +11,8 @@
 import pytest
 
 import dolfinx
+from dolfinx.common import IndexMap
+from dolfinx.cpp.la import SparsityPattern
 from dolfinx.fem import (
     Function,
     apply_lifting,
@@ -22,9 +24,9 @@
     functionspace,
     locate_dofs_topological,
 )
-from dolfinx.la import InsertMode
+from dolfinx.la import InsertMode, matrix_csr, vector
 from dolfinx.mesh import create_unit_square, exterior_facet_indices
-from ufl import SpatialCoordinate, TestFunction, TrialFunction, div, dx, grad, inner
+from ufl import SpatialCoordinate, TestFunction, TrialFunction, as_vector, div, dx, grad, inner
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.complex128])
@@ -130,3 +132,105 @@ def solve_and_check(solver, b):
     solver_2.set_option("Fact", "SamePattern")
     uh_2 = solve_and_check(solver_2, b_2)
     check_error(u_ex, uh_2)
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.complex128])
+@pytest.mark.skipif(not dolfinx.has_superlu_dist, reason="No SuperLU_DIST")
+def test_superlu_solver_blocked(dtype):
+    """Vector Poisson problem on a vector Lagrange space (block size 2)."""
+    from dolfinx.la.superlu_dist import superlu_dist_matrix, superlu_dist_solver
+
+    mesh_dtype = dtype().real.dtype
+    mesh = create_unit_square(MPI.COMM_WORLD, 5, 5, dtype=mesh_dtype)
+    V = functionspace(mesh, ("Lagrange", 3, (2,)))
+    u, v = TrialFunction(V), TestFunction(V)
+
+    a = form(inner(grad(u), grad(v)) * dx, dtype=dtype)
+
+    def u_ex(x):
+        return np.vstack((x[1] ** 3, x[0] ** 3))
+
+    x = SpatialCoordinate(mesh)
+    u_ex_ufl = as_vector((x[1] ** 3, x[0] ** 3))
+    f = -div(grad(u_ex_ufl))
+    L = form(inner(f, v) * dx, dtype=dtype)
+
+    u_bc = Function(V, dtype=dtype)
+    u_bc.interpolate(u_ex)
+
+    facetdim = mesh.topology.dim - 1
+    mesh.topology.create_connectivity(facetdim, mesh.topology.dim)
+    bndry_facets = exterior_facet_indices(mesh.topology)
+    bdofs = locate_dofs_topological(V, facetdim, bndry_facets)
+    bc = dirichletbc(u_bc, bdofs)
+
+    b = assemble_vector(L)
+    apply_lifting(b.array, [a], bcs=[[bc]])
+    b.scatter_reverse(InsertMode.add)
+    bc.set(b.array)
+
+    A = assemble_matrix(a, bcs=[bc])
+    A.scatter_reverse()
+    assert A.block_size == [2, 2]
+
+    A_superlu = superlu_dist_matrix(A)
+    solver = superlu_dist_solver(A_superlu)
+    solver.set_option("SymmetricMode", "YES")
+    uh = Function(V, dtype=dtype)
+    error_code = solver.solve(b, uh.x)
+    assert error_code == 0
+    uh.x.scatter_forward()
+
+    M = form(inner(u_ex_ufl - uh, u_ex_ufl - uh) * dx, dtype=dtype)
+    error = mesh.comm.allreduce(assemble_scalar(M), op=MPI.SUM)
+    eps = np.sqrt(np.finfo(dtype).eps)
+    assert np.isclose(error, 0.0, atol=eps)
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.complex128])
+@pytest.mark.skipif(not dolfinx.has_superlu_dist, reason="No SuperLU_DIST")
+@pytest.mark.skipif(MPI.COMM_WORLD.size > 1, reason="Hand-built single-rank matrix")
+def test_superlu_solver_asymmetric_blocks(dtype):
+    """Hand-built 3 x 2 block MatrixCSR with bs = (2, 3), i.e. a 6 x 6 scalar system."""
+    from dolfinx.la.superlu_dist import superlu_dist_matrix, superlu_dist_solver
+
+    bs0, bs1 = 2, 3
+    n_row_blocks, n_col_blocks = 3, 2
+
+    im_row = IndexMap(MPI.COMM_WORLD, n_row_blocks)
+    im_col = IndexMap(MPI.COMM_WORLD, n_col_blocks)
+    sp = SparsityPattern(MPI.COMM_WORLD, [im_row, im_col], [bs0, bs1])
+    for i in range(n_row_blocks):
+        for j in range(n_col_blocks):
+            sp.insert(i, j)
+    sp.finalize()
+
+    A = matrix_csr(sp, dtype=dtype)
+    assert A.block_size == [bs0, bs1]
+
+    rng = np.random.default_rng(0)
+    A_dense = (np.eye(6) * 10.0 + rng.standard_normal((6, 6))).astype(dtype)
+
+    for i in range(n_row_blocks):
+        for j in range(n_col_blocks):
+            block_idx = i * n_col_blocks + j
+            for i0 in range(bs0):
+                for i1 in range(bs1):
+                    A.data[block_idx * bs0 * bs1 + i0 * bs1 + i1] = A_dense[
+                        i * bs0 + i0, j * bs1 + i1
+                    ]
+
+    b_np = rng.standard_normal(6).astype(dtype)
+    x_expected = np.linalg.solve(A_dense, b_np)
+
+    b = vector(im_row, bs=bs0, dtype=dtype)
+    b.array[:] = b_np
+    u = vector(im_col, bs=bs1, dtype=dtype)
+
+    A_superlu = superlu_dist_matrix(A)
+    solver = superlu_dist_solver(A_superlu)
+    error_code = solver.solve(b, u)
+    assert error_code == 0
+    u.scatter_forward()
+
+    assert np.allclose(u.array, x_expected)