
Commit 85e7c35

Sebastien Loisel committed
Constrain backslash dispatch by solver type in backend
Fix type dispatch so CUDA solves use cuDSS instead of MUMPS:
- Generic \, lu, ldlt now require HPCBackend{D,C,SolverMUMPS}
- CUDA extension \, lu, ldlt now require CuDSSBackend (SolverCuDSS)
- Add CuDSSBackend{C} type alias for cuDSS-specific backends
- Right division operators also constrained to MUMPS backends
- Remove unused comm_barrier function
1 parent 4ca3c32 commit 85e7c35

4 files changed: 74 additions and 41 deletions
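To see the new dispatch rules in action, here is a hypothetical usage sketch in Julia; the HPCSparseMatrix and HPCVector constructors shown (building from a SparseMatrixCSC plus a backend) are assumed for illustration and are not part of this diff:

    using LinearAlgebra, SparseArrays
    using HPCLinearAlgebra

    # MUMPS path: the CPU backend carries SolverMUMPS in its type parameters,
    # so `\` dispatches to the generic methods in src/HPCLinearAlgebra.jl.
    be = BACKEND_CPU_SERIAL                 # HPCBackend{DeviceCPU, CommSerial, SolverMUMPS}
    K  = sprandn(100, 100, 0.05) + 100I     # well-conditioned sparse test matrix
    A  = HPCSparseMatrix(K, be)             # assumed constructor
    b  = HPCVector(randn(100), be)          # assumed constructor
    x  = A \ b                              # hits the MUMPS backslash (lu, solve, finalize!)

    # cuDSS path: with CUDA loaded, a backend whose solver parameter is
    # SolverCuDSS matches only the methods in the CUDA extension:
    # using CUDA
    # be_gpu = backend_cuda_serial()        # HPCBackend{DeviceCUDA, C, SolverCuDSS}
    # x_gpu  = HPCSparseMatrix(K, be_gpu) \ HPCVector(randn(100), be_gpu)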


ext/HPCLinearAlgebraCUDAExt.jl

Lines changed: 32 additions & 15 deletions
@@ -28,7 +28,7 @@ using LinearAlgebra
 using HPCLinearAlgebra: HPCBackend, DeviceCPU, DeviceCUDA, DeviceMetal,
     CommSerial, CommMPI, AbstractComm, AbstractDevice,
     SolverMUMPS, AbstractSolverCuDSS,
-    comm_rank, comm_size, comm_barrier
+    comm_rank, comm_size

 # Type aliases for convenience
 const CuBackend{C,S} = HPCLinearAlgebra.HPCBackend{HPCLinearAlgebra.DeviceCUDA, C, S}
@@ -47,6 +47,9 @@ cuDSS sparse direct solver for CUDA GPUs.
 """
 struct SolverCuDSS <: HPCLinearAlgebra.AbstractSolverCuDSS end

+# Type alias for cuDSS-specific backends (constrains solver type to SolverCuDSS)
+const CuDSSBackend{C} = HPCLinearAlgebra.HPCBackend{HPCLinearAlgebra.DeviceCUDA, C, SolverCuDSS}
+
 # ============================================================================
 # Pre-constructed Backend Constants
 # ============================================================================
@@ -549,29 +552,33 @@ end
 # ============================================================================

 """
-    lu(A::HPCSparseMatrix{T,Ti,<:CuBackend})
+    lu(A::HPCSparseMatrix{T,Ti,<:CuDSSBackend})

 Compute LU factorization of a GPU sparse matrix using cuDSS.
 Returns a CuDSSFactorizationMPI that can be used with `F \\ b`.

 If a previous factorization with the same sparsity structure exists,
 the cached analysis (permutation + elimination tree) is reused,
 skipping the expensive reordering phase.
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
-function LinearAlgebra.lu(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,<:CuBackend}) where {T,Ti}
+function LinearAlgebra.lu(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,<:CuDSSBackend}) where {T,Ti}
     return _create_cudss_factorization(A, false)
 end

 """
-    ldlt(A::HPCSparseMatrix{T,Ti,<:CuBackend})
+    ldlt(A::HPCSparseMatrix{T,Ti,<:CuDSSBackend})

 Compute LDLT factorization of a symmetric positive definite GPU sparse matrix.
 Returns a CuDSSFactorizationMPI that can be used with `F \\ b`.

 If a previous factorization with the same sparsity structure exists,
 the cached analysis (permutation + elimination tree) is reused.
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
-function LinearAlgebra.ldlt(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,<:CuBackend}) where {T,Ti}
+function LinearAlgebra.ldlt(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,<:CuDSSBackend}) where {T,Ti}
     return _create_cudss_factorization(A, true)
 end

@@ -693,12 +700,14 @@ _get_mpi_comm_for_nccl(c::HPCLinearAlgebra.CommMPI) = c.comm
 _get_mpi_comm_for_nccl(::HPCLinearAlgebra.CommSerial) = error("cuDSS MGMN mode requires MPI communication (CommMPI), not CommSerial")

 """
-    solve(F::CuDSSFactorizationMPI{T,B}, b::HPCVector{T,<:CuBackend}) where {T,B}
+    solve(F::CuDSSFactorizationMPI{T,B}, b::HPCVector{T,<:CuDSSBackend}) where {T,B}

 Solve the linear system using the cuDSS factorization.
 This is solve-only - no refactorization is performed.
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
-function HPCLinearAlgebra.solve(F::CuDSSFactorizationMPI{T,B}, b::HPCLinearAlgebra.HPCVector{T,<:CuBackend}) where {T,B}
+function HPCLinearAlgebra.solve(F::CuDSSFactorizationMPI{T,B}, b::HPCLinearAlgebra.HPCVector{T,<:CuDSSBackend}) where {T,B}
     comm = F.backend.comm

     # Copy b directly to RHS buffer (GPU to GPU)
@@ -712,11 +721,13 @@ function HPCLinearAlgebra.solve(F::CuDSSFactorizationMPI{T,B}, b::HPCLinearAlgeb
 end

 """
-    \\(F::CuDSSFactorizationMPI{T,B}, b::HPCVector{T,<:CuBackend}) where {T,B}
+    \\(F::CuDSSFactorizationMPI{T,B}, b::HPCVector{T,<:CuDSSBackend}) where {T,B}

 Solve the linear system using backslash notation (solve-only, no refactorization).
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
-function Base.:\(F::CuDSSFactorizationMPI{T,B}, b::HPCLinearAlgebra.HPCVector{T,<:CuBackend}) where {T,B}
+function Base.:\(F::CuDSSFactorizationMPI{T,B}, b::HPCLinearAlgebra.HPCVector{T,<:CuDSSBackend}) where {T,B}
     return HPCLinearAlgebra.solve(F, b)
 end

@@ -765,14 +776,16 @@ end
 # 3. The cudss matrix wrapper points to our values buffer - we update it in place

 """
-    _refactorize_and_solve!(F::CuDSSFactorizationMPI{T,B}, A::HPCSparseMatrix{T,Ti,B}, b::HPCVector{T,B}) where {T,Ti,B}
+    _refactorize_and_solve!(F::CuDSSFactorizationMPI{T,B}, A::HPCSparseMatrix{T,Ti,B}, b::HPCVector{T,B}) where {T,Ti,B<:CuDSSBackend}

 Update the values in a cached factorization, refactorize (skip analysis), and solve.
 Returns the solution vector.
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
 function _refactorize_and_solve!(F::CuDSSFactorizationMPI{T,B},
                                  A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B},
-                                 b::HPCLinearAlgebra.HPCVector{T,B}) where {T,Ti,B<:CuBackend}
+                                 b::HPCLinearAlgebra.HPCVector{T,B}) where {T,Ti,B<:CuDSSBackend}
     comm = F.backend.comm

     # Update values in the GPU buffer (the cudss matrix wrapper points to this)
@@ -794,17 +807,19 @@ function _refactorize_and_solve!(F::CuDSSFactorizationMPI{T,B},
 end

 """
-    \\(A::HPCSparseMatrix{T,Ti,B}, b::HPCVector{T,B}) where {T,Ti,B<:CuBackend}
+    \\(A::HPCSparseMatrix{T,Ti,B}, b::HPCVector{T,B}) where {T,Ti,B<:CuDSSBackend}

 Solve A*x = b using cuDSS with analysis caching.

 First call for a given sparsity pattern: full analysis + factorization.
 Subsequent calls with same pattern: refactorize only (skip expensive analysis).

 The cuDSS data object is cached globally and reused - never destroyed.
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
 function Base.:\(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B},
-                 b::HPCLinearAlgebra.HPCVector{T,B}) where {T,Ti,B<:CuBackend}
+                 b::HPCLinearAlgebra.HPCVector{T,B}) where {T,Ti,B<:CuDSSBackend}
     structural_hash = HPCLinearAlgebra._ensure_hash(A)
     cache_key = (structural_hash, false, T) # false = not symmetric (LU)

@@ -826,12 +841,14 @@ function Base.:\(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B},
 end

 """
-    \\(A::Symmetric{T,<:HPCSparseMatrix{T,Ti,B}}, b::HPCVector{T,B}) where {T,Ti,B<:CuBackend}
+    \\(A::Symmetric{T,<:HPCSparseMatrix{T,Ti,B}}, b::HPCVector{T,B}) where {T,Ti,B<:CuDSSBackend}

 Solve A*x = b for a symmetric matrix using LDLT with analysis caching.
+
+Note: This method is specific to cuDSS backends (SolverCuDSS).
 """
 function Base.:\(A::Symmetric{T,<:HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B}},
-                 b::HPCLinearAlgebra.HPCVector{T,B}) where {T,Ti,B<:CuBackend}
+                 b::HPCLinearAlgebra.HPCVector{T,B}) where {T,Ti,B<:CuDSSBackend}
     A_inner = parent(A)
     structural_hash = HPCLinearAlgebra._ensure_hash(A_inner)
     cache_key = (structural_hash, true, T) # true = symmetric (LDLT)
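A sketch of the analysis-caching behavior these docstrings describe; CUDA hardware is required, the constructors are assumed, and whether a serial (non-MPI) CUDA backend is usable depends on the cuDSS mode, so treat this as illustrative only:

    using CUDA, LinearAlgebra, SparseArrays
    using HPCLinearAlgebra

    be = backend_cuda_mpi()                 # solver type parameter is SolverCuDSS
    K  = sprandn(1000, 1000, 0.01) + 1000I
    A  = HPCSparseMatrix(K, be)             # assumed constructor
    b  = HPCVector(randn(1000), be)         # assumed constructor

    x1 = A \ b                        # first solve for this pattern: full analysis + factorization
    K.nzval .*= 2.0                   # new values, same sparsity pattern
    x2 = HPCSparseMatrix(K, be) \ b   # cached analysis reused; refactorization only

    F = lu(A)                         # explicit cuDSS factorization (CuDSSFactorizationMPI)
    y = F \ b                         # solve-only, no refactorization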

src/HPCLinearAlgebra.jl

Lines changed: 32 additions & 14 deletions
@@ -33,7 +33,7 @@ export HPCBackendCPU, HPCBackendMetal, HPCBackendCUDA
 export backend_cpu_serial, backend_cpu_mpi, backend_metal_mpi, backend_cuda_serial, backend_cuda_mpi
 export BACKEND_CPU_SERIAL, BACKEND_CPU_MPI # Pre-constructed CPU backend constants
 # CUDA backends: use backend_cuda_serial() and backend_cuda_mpi() after loading CUDA
-export comm_rank, comm_size, comm_barrier
+export comm_rank, comm_size
 export array_type, matrix_type
 export backends_compatible, assert_backends_compatible

@@ -588,38 +588,50 @@ end
 # ============================================================================

 """
-    Base.:\\(A::HPCSparseMatrix{T}, b::HPCVector{T}) where T
+    Base.:\\(A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}, b::HPCVector{T,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}

-Solve A*x = b using LU factorization.
+Solve A*x = b using LU factorization with MUMPS.
 For symmetric matrices, use `Symmetric(A) \\ b` to use the faster LDLT factorization.
 For repeated solves, compute the factorization once with `lu(A)` or `ldlt(A)`.
+
+Note: This method is specific to MUMPS backends. GPU backends (cuDSS) have their own
+specialized backslash methods defined in the CUDA extension.
 """
-function Base.:\(A::HPCSparseMatrix{T}, b::HPCVector{T}) where T
+function Base.:\(A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}},
+                 b::HPCVector{T,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
     F = LinearAlgebra.lu(A)
     x = F \ b
     finalize!(F)
     return x
 end

 """
-    Base.:\\(A::Symmetric{T,<:HPCSparseMatrix{T}}, b::HPCVector{T}) where T
+    Base.:\\(A::Symmetric{T,<:HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}}, b::HPCVector{T,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}

-Solve A*x = b for a symmetric matrix using LDLT (no symmetry check needed).
+Solve A*x = b for a symmetric matrix using LDLT with MUMPS (no symmetry check needed).
 Use `Symmetric(A)` to wrap a known-symmetric matrix and skip the expensive symmetry check.
+
+Note: This method is specific to MUMPS backends. GPU backends (cuDSS) have their own
+specialized backslash methods defined in the CUDA extension.
 """
-function Base.:\(A::Symmetric{T,<:HPCSparseMatrix{T}}, b::HPCVector{T}) where T
+function Base.:\(A::Symmetric{T,<:HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}},
+                 b::HPCVector{T,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
     F = LinearAlgebra.ldlt(parent(A))
     x = F \ b
     finalize!(F)
     return x
 end

 """
-    Base.:\\(At::Transpose{T,<:HPCSparseMatrix{T}}, b::HPCVector{T}) where T
+    Base.:\\(At::Transpose{T,<:HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}}, b::HPCVector{T,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
+
+Solve transpose(A)*x = b using LU factorization with MUMPS.

-Solve transpose(A)*x = b using LU factorization.
+Note: This method is specific to MUMPS backends. GPU backends (cuDSS) have their own
+specialized backslash methods defined in the CUDA extension.
 """
-function Base.:\(At::Transpose{T,<:HPCSparseMatrix{T}}, b::HPCVector{T}) where T
+function Base.:\(At::Transpose{T,<:HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}},
+                 b::HPCVector{T,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
     A_t = HPCSparseMatrix(At)
     F = LinearAlgebra.lu(A_t)
     x = F \ b
@@ -635,23 +647,29 @@ end
 # For row vectors: transpose(v) / A solves x * A = transpose(v)

 """
-    Base.:/(vt::Transpose{T,HPCVector{T}}, A::HPCSparseMatrix{T}) where T
+    Base.:/(vt::Transpose{T,HPCVector{T,HPCBackend{D,C,SolverMUMPS}}}, A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}

 Solve x * A = transpose(v), returning x as a transposed HPCVector.
 Equivalent to transpose(transpose(A) \\ v).
+
+Note: This method is specific to MUMPS backends.
 """
-function Base.:/(vt::Transpose{T,HPCVector{T}}, A::HPCSparseMatrix{T}) where T
+function Base.:/(vt::Transpose{T,HPCVector{T,HPCBackend{D,C,SolverMUMPS}}},
+                 A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
     v = vt.parent
     x = transpose(A) \ v
     return transpose(x)
 end

 """
-    Base.:/(vt::Transpose{T,HPCVector{T}}, At::Transpose{T,<:HPCSparseMatrix{T}}) where T
+    Base.:/(vt::Transpose{T,HPCVector{T,HPCBackend{D,C,SolverMUMPS}}}, At::Transpose{T,<:HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}}) where {T,Ti,D,C}

 Solve x * transpose(A) = transpose(v), returning x as a transposed HPCVector.
+
+Note: This method is specific to MUMPS backends.
 """
-function Base.:/(vt::Transpose{T,HPCVector{T}}, At::Transpose{T,<:HPCSparseMatrix{T}}) where T
+function Base.:/(vt::Transpose{T,HPCVector{T,HPCBackend{D,C,SolverMUMPS}}},
+                 At::Transpose{T,<:HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}}) where {T,Ti,D,C}
     v = vt.parent
     A = At.parent
     x = A \ v
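A minimal sketch of the row-vector right division constrained above, on a MUMPS backend (constructors assumed as before):

    using LinearAlgebra, SparseArrays
    using HPCLinearAlgebra

    be = BACKEND_CPU_SERIAL                                # a SolverMUMPS backend
    A  = HPCSparseMatrix(sprandn(50, 50, 0.1) + 50I, be)   # assumed constructor
    v  = HPCVector(randn(50), be)                          # assumed constructor

    xt = transpose(v) / A                    # solves x * A = transpose(v)
    xt_check = transpose(transpose(A) \ v)   # equivalent form, per the docstring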

src/backends.jl

Lines changed: 0 additions & 8 deletions
@@ -297,14 +297,6 @@ function comm_waitall(::CommMPI, requests)
     end
 end

-"""
-    comm_barrier(comm::AbstractComm)
-
-Synchronization barrier. For CommSerial, this is a no-op.
-"""
-comm_barrier(::CommSerial) = nothing
-comm_barrier(c::CommMPI) = MPI.Barrier(c.comm)
-
 # ============================================================================
 # HPCBackend Factory Functions
 # ============================================================================
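comm_barrier had no callers, hence its removal. Code that still needs a barrier can call MPI.jl directly, which is all the deleted CommMPI method did; a minimal sketch:

    using MPI

    MPI.Init()
    MPI.Barrier(MPI.COMM_WORLD)   # the call comm_barrier(c::CommMPI) forwarded to c.comm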

src/mumps_factorization.jl

Lines changed: 10 additions & 4 deletions
@@ -469,23 +469,29 @@ end
 # ============================================================================

 """
-    LinearAlgebra.lu(A::HPCSparseMatrix{T,Ti,B}) where {T,Ti,B}
+    LinearAlgebra.lu(A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}

 Compute LU factorization of a distributed sparse matrix using MUMPS.
 Returns a `MUMPSFactorization` for use with `\\` or `solve`.
+
+Note: This method is specific to MUMPS backends. GPU backends (cuDSS) define their own
+lu method in the CUDA extension.
 """
-function LinearAlgebra.lu(A::HPCSparseMatrix{T,Ti,B}) where {T,Ti,B}
+function LinearAlgebra.lu(A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
     return _create_mumps_factorization(A, false)
 end

 """
-    LinearAlgebra.ldlt(A::HPCSparseMatrix{T,Ti,B}) where {T,Ti,B}
+    LinearAlgebra.ldlt(A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}

 Compute LDLT factorization of a distributed symmetric sparse matrix using MUMPS.
 The matrix must be symmetric; only the lower triangular part is used.
 Returns a `MUMPSFactorization` for use with `\\` or `solve`.
+
+Note: This method is specific to MUMPS backends. GPU backends (cuDSS) define their own
+ldlt method in the CUDA extension.
 """
-function LinearAlgebra.ldlt(A::HPCSparseMatrix{T,Ti,B}) where {T,Ti,B}
+function LinearAlgebra.ldlt(A::HPCSparseMatrix{T,Ti,HPCBackend{D,C,SolverMUMPS}}) where {T,Ti,D,C}
     return _create_mumps_factorization(A, true)
 end
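A sketch of reusing an explicit MUMPS factorization across several right-hand sides, per the docstrings above (constructors assumed as before):

    using LinearAlgebra, SparseArrays
    using HPCLinearAlgebra

    be = BACKEND_CPU_SERIAL
    A  = HPCSparseMatrix(sprandn(200, 200, 0.02) + 200I, be)   # assumed constructor
    b1 = HPCVector(randn(200), be)                             # assumed constructor
    b2 = HPCVector(randn(200), be)

    F  = lu(A)          # MUMPSFactorization
    x1 = F \ b1
    x2 = F \ b2         # reuse the factorization for a second right-hand side
    finalize!(F)        # release MUMPS resources, as the generic `\` method does

    # Symmetric case: ldlt(A) reads only the lower triangle.
    # Fs = ldlt(A); xs = Fs \ b1; finalize!(Fs)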
