Skip to content

Commit 4ca3c32

Browse files
author
Sebastien Loisel
committed
Unify CPU/GPU code paths, remove synchronize calls, reset version
- Remove isa-based CPU/GPU branching in blocks.jl, dense.jl, mumps_factorization.jl; use unified helpers (_convert_array, _ensure_cpu, to_backend) instead
- Remove CUDA.synchronize() and Metal.synchronize() from kernel dispatches and cuDSS operations (implicit sync on reads)
- Remove comm_barrier() calls from cuDSS collective operations
- Add _array_to_device stub to main module for GPU extensions
- Update codecov.yml with correct extension filenames
- Reset version to 0.1.0 for registry re-registration
1 parent 9110b09 commit 4ca3c32

8 files changed

Lines changed: 37 additions & 139 deletions

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
authors = ["S. Loisel"]
22
name = "HPCLinearAlgebra"
33
uuid = "537374f1-5608-4525-82fb-641dce542540"
4-
version = "0.1.9"
4+
version = "0.1.0"
55

66
[compat]
77
Adapt = "4"

codecov.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ coverage:
1414

1515
ignore:
1616
- "ext/"
17-
- "ext/LinearAlgebraMPICUDAExt.jl"
18-
- "ext/LinearAlgebraMPIMetalExt.jl"
17+
- "ext/HPCLinearAlgebraCUDAExt.jl"
18+
- "ext/HPCLinearAlgebraMetalExt.jl"

ext/HPCLinearAlgebraCUDAExt.jl

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -667,19 +667,13 @@ function _create_cudss_factorization(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B}
667667
rhs = rhs_ref[]
668668
_cudss_matrix_set_distribution_row1d(rhs, Int64(first_row), Int64(last_row))
669669

670-
comm_barrier(comm)
671-
672670
# Run analysis phase
673671
# Note: Analysis caching is disabled in MGMN mode because cudssDataGet for
674672
# PERM_REORDER_ROW/COL returns INVALID_VALUE. Each factorization does full analysis.
675673
_cudss_execute(handle, CUDSS_PHASE_ANALYSIS, config, data, matrix, solution, rhs)
676-
CUDA.synchronize()
677-
comm_barrier(comm)
678674

679675
# Run numeric factorization
680676
_cudss_execute(handle, CUDSS_PHASE_FACTORIZATION, config, data, matrix, solution, rhs)
681-
CUDA.synchronize()
682-
comm_barrier(comm)
683677

684678
# Create factorization object
685679
F = CuDSSFactorizationMPI{T,B}(
@@ -710,10 +704,8 @@ function HPCLinearAlgebra.solve(F::CuDSSFactorizationMPI{T,B}, b::HPCLinearAlgeb
710704
# Copy b directly to RHS buffer (GPU to GPU)
711705
copyto!(F.b_gpu, b.v)
712706

713-
# Execute solve phase only
707+
# Execute solve phase only (collective operation)
714708
_cudss_execute(F.handle, CUDSS_PHASE_SOLVE, F.config, F.data, F.matrix, F.solution, F.rhs)
715-
CUDA.synchronize()
716-
comm_barrier(comm)
717709

718710
# Return GPU vector (copy from internal buffer) with backend
719711
return HPCLinearAlgebra.HPCVector{T,B}(b.structural_hash, b.partition, copy(F.x_gpu), F.backend)
@@ -736,10 +728,6 @@ Only destroys: data (L/U factors), matrix wrappers.
736728
Does NOT destroy: handle, config (global, cached per-process).
737729
"""
738730
function HPCLinearAlgebra.finalize!(F::CuDSSFactorizationMPI)
739-
comm = F.backend.comm
740-
CUDA.synchronize()
741-
comm_barrier(comm)
742-
743731
# Destroy data object (holds L/U factors - collective operation in MGMN mode)
744732
if F.data != C_NULL
745733
_cudss_data_destroy(F.handle, F.data)
@@ -760,9 +748,6 @@ function HPCLinearAlgebra.finalize!(F::CuDSSFactorizationMPI)
760748
F.rhs = C_NULL
761749
end
762750

763-
CUDA.synchronize()
764-
comm_barrier(comm)
765-
766751
return nothing
767752
end
768753

@@ -798,18 +783,11 @@ function _refactorize_and_solve!(F::CuDSSFactorizationMPI{T,B},
798783
# Copy RHS to buffer
799784
copyto!(F.b_gpu, b.v)
800785

801-
CUDA.synchronize()
802-
comm_barrier(comm)
803-
804786
# Refactorize (skip analysis - the symbolic factorization is already done)
805787
_cudss_execute(F.handle, CUDSS_PHASE_FACTORIZATION, F.config, F.data, F.matrix, F.solution, F.rhs)
806-
CUDA.synchronize()
807-
comm_barrier(comm)
808788

809789
# Solve
810790
_cudss_execute(F.handle, CUDSS_PHASE_SOLVE, F.config, F.data, F.matrix, F.solution, F.rhs)
811-
CUDA.synchronize()
812-
comm_barrier(comm)
813791

814792
# Return GPU vector (copy from internal buffer) with backend
815793
return HPCLinearAlgebra.HPCVector{T, B}(b.structural_hash, b.partition, copy(F.x_gpu), b.backend)
@@ -839,12 +817,9 @@ function Base.:\(A::HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B},
839817
F = _create_cudss_factorization(A, false)
840818
_cudss_backslash_cache[cache_key] = F
841819

842-
# Solve
843-
comm = A.backend.comm
820+
# Solve (collective operation)
844821
copyto!(F.b_gpu, b.v)
845822
_cudss_execute(F.handle, CUDSS_PHASE_SOLVE, F.config, F.data, F.matrix, F.solution, F.rhs)
846-
CUDA.synchronize()
847-
comm_barrier(comm)
848823

849824
return HPCLinearAlgebra.HPCVector{T, B}(b.structural_hash, b.partition, copy(F.x_gpu), b.backend)
850825
end
@@ -870,12 +845,9 @@ function Base.:\(A::Symmetric{T,<:HPCLinearAlgebra.HPCSparseMatrix{T,Ti,B}},
870845
F = _create_cudss_factorization(A_inner, true)
871846
_cudss_backslash_cache[cache_key] = F
872847

873-
# Solve
874-
comm = A_inner.backend.comm
848+
# Solve (collective operation)
875849
copyto!(F.b_gpu, b.v)
876850
_cudss_execute(F.handle, CUDSS_PHASE_SOLVE, F.config, F.data, F.matrix, F.solution, F.rhs)
877-
CUDA.synchronize()
878-
comm_barrier(comm)
879851

880852
return HPCLinearAlgebra.HPCVector{T, B}(b.structural_hash, b.partition, copy(F.x_gpu), b.backend)
881853
end
@@ -928,7 +900,6 @@ function _cuda_map_rows_kernel_dispatch(f, output::CuMatrix{T}, arg1::CuMatrix{T
928900
threads = min(n, config.threads)
929901
blocks = cld(n, threads)
930902
kernel(f, output, arg1, Val(ncols1), Val(out_cols); threads=threads, blocks=blocks)
931-
CUDA.synchronize()
932903
end
933904

934905
function _cuda_map_rows_kernel_dispatch(f, output::CuMatrix{T}, arg1::CuMatrix{T}, arg2::CuMatrix{T}) where T
@@ -942,7 +913,6 @@ function _cuda_map_rows_kernel_dispatch(f, output::CuMatrix{T}, arg1::CuMatrix{T
942913
threads = min(n, config.threads)
943914
blocks = cld(n, threads)
944915
kernel(f, output, arg1, arg2, Val(ncols1), Val(ncols2), Val(out_cols); threads=threads, blocks=blocks)
945-
CUDA.synchronize()
946916
end
947917

948918
# CUDA kernels

ext/HPCLinearAlgebraMetalExt.jl

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,6 @@ function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T})
153153
threads = min(n, 256)
154154
groups = cld(n, threads)
155155
kernel(f, output, arg1, Val(ncols1), Val(out_cols); threads=threads, groups=groups)
156-
Metal.synchronize()
157156
end
158157

159158
function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T}, arg2::MtlMatrix{T}) where T
@@ -166,7 +165,6 @@ function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T},
166165
threads = min(n, 256)
167166
groups = cld(n, threads)
168167
kernel(f, output, arg1, arg2, Val(ncols1), Val(ncols2), Val(out_cols); threads=threads, groups=groups)
169-
Metal.synchronize()
170168
end
171169

172170
function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T}, arg2::MtlMatrix{T}, arg3::MtlMatrix{T}) where T
@@ -180,7 +178,6 @@ function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T},
180178
threads = min(n, 256)
181179
groups = cld(n, threads)
182180
kernel(f, output, arg1, arg2, arg3, Val(ncols1), Val(ncols2), Val(ncols3), Val(out_cols); threads=threads, groups=groups)
183-
Metal.synchronize()
184181
end
185182

186183
function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T}, arg2::MtlMatrix{T}, arg3::MtlMatrix{T}, arg4::MtlMatrix{T}) where T
@@ -195,7 +192,6 @@ function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T},
195192
threads = min(n, 256)
196193
groups = cld(n, threads)
197194
kernel(f, output, arg1, arg2, arg3, arg4, Val(ncols1), Val(ncols2), Val(ncols3), Val(ncols4), Val(out_cols); threads=threads, groups=groups)
198-
Metal.synchronize()
199195
end
200196

201197
function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T}, arg2::MtlMatrix{T}, arg3::MtlMatrix{T}, arg4::MtlMatrix{T}, arg5::MtlMatrix{T}) where T
@@ -211,7 +207,6 @@ function _map_rows_kernel_dispatch(f, output::MtlMatrix{T}, arg1::MtlMatrix{T},
211207
threads = min(n, 256)
212208
groups = cld(n, threads)
213209
kernel(f, output, arg1, arg2, arg3, arg4, arg5, Val(ncols1), Val(ncols2), Val(ncols3), Val(ncols4), Val(ncols5), Val(out_cols); threads=threads, groups=groups)
214-
Metal.synchronize()
215210
end
216211

217212
# ============================================================================

src/HPCLinearAlgebra.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ const _identity_addition_plan_cache = Dict{Tuple{Blake3Hash,DataType,DataType},A
170170
# Forward declarations - implementations are after include("backends.jl")
171171
function _convert_array end
172172
function to_backend end
173+
function _array_to_device end # GPU extensions provide CPU→GPU conversion
173174

174175
"""
175176
clear_plan_cache!()

src/blocks.jl

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -145,18 +145,8 @@ function Base.cat(As::HPCSparseMatrix{T,Ti,Bk}...; dims) where {T,Ti,Bk<:HPCBack
145145

146146
result = HPCSparseMatrix_local(transpose(AT_local), backend)
147147

148-
# Convert to GPU if inputs were GPU (GPU→CPU for MPI, then CPU→GPU for result)
149-
device = backend.device
150-
if !(device isa DeviceCPU)
151-
nzval_target = copyto!(similar(As[1].nzval, length(result.nzval)), result.nzval)
152-
rowptr_target = _to_target_device(result.rowptr, device)
153-
colval_target = _to_target_device(result.colval, device)
154-
return HPCSparseMatrix{T,Ti,Bk}(
155-
result.structural_hash, result.row_partition, result.col_partition, result.col_indices,
156-
result.rowptr, result.colval, nzval_target, result.nrows_local, result.ncols_compressed,
157-
nothing, result.cached_symmetric, rowptr_target, colval_target, backend)
158-
end
159-
return result
148+
# Convert to target backend (no-op for CPU, copies for GPU)
149+
return to_backend(result, backend)
160150
end
161151

162152
# ============================================================================
@@ -292,12 +282,8 @@ function Base.cat(As::HPCMatrix{T,B}...; dims) where {T, B<:HPCBackend}
292282
# Step 4: Create HPCMatrix from local data
293283
result = HPCMatrix_local(local_matrix, backend)
294284

295-
# Convert to GPU if inputs were GPU (check if backend device is not CPU)
296-
if !(backend.device isa DeviceCPU)
297-
local_matrix_gpu = copyto!(similar(As[1].A, local_nrows, total_cols), local_matrix)
298-
return HPCMatrix{T,B}(result.structural_hash, result.row_partition, result.col_partition, local_matrix_gpu, backend)
299-
end
300-
return result
285+
# Convert to target backend (no-op for CPU, copies for GPU)
286+
return to_backend(result, backend)
301287
end
302288

303289
Base.hcat(As::HPCMatrix...) = cat(As...; dims=2)
@@ -483,7 +469,6 @@ function blockdiag(As::HPCSparseMatrix{T,Ti,Bk}...) where {T,Ti,Bk<:HPCBackend}
483469

484470
backend = As[1].backend
485471
comm = backend.comm
486-
device = backend.device
487472
rank = comm_rank(comm)
488473
nranks = comm_size(comm)
489474

@@ -555,15 +540,6 @@ function blockdiag(As::HPCSparseMatrix{T,Ti,Bk}...) where {T,Ti,Bk<:HPCBackend}
555540

556541
result = HPCSparseMatrix_local(transpose(AT_local), backend)
557542

558-
# Convert to GPU if inputs were GPU (GPU→CPU for MPI, then CPU→GPU for result)
559-
if !(device isa DeviceCPU)
560-
nzval_target = copyto!(similar(As[1].nzval, length(result.nzval)), result.nzval)
561-
rowptr_target = _to_target_device(result.rowptr, device)
562-
colval_target = _to_target_device(result.colval, device)
563-
return HPCSparseMatrix{T,Ti,Bk}(
564-
result.structural_hash, result.row_partition, result.col_partition, result.col_indices,
565-
result.rowptr, result.colval, nzval_target, result.nrows_local, result.ncols_compressed,
566-
nothing, result.cached_symmetric, rowptr_target, colval_target, backend)
567-
end
568-
return result
543+
# Convert to target backend (no-op for CPU, copies for GPU)
544+
return to_backend(result, backend)
569545
end

src/dense.jl

Lines changed: 22 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -618,16 +618,8 @@ function LinearAlgebra.mul!(y::HPCVector{T,B}, A::HPCMatrix{T,B}, x::HPCVector{T
618618
plan = get_dense_vector_plan(A, x)
619619
execute_plan!(plan, x)
620620

621-
# Check if CPU or GPU based on device type
622-
if A.backend.device isa DeviceCPU
623-
# CPU path
624-
y_local_cpu = Vector{T}(undef, length(y.v))
625-
LinearAlgebra.mul!(y_local_cpu, A.A, plan.gathered_cpu)
626-
copyto!(y.v, y_local_cpu)
627-
else
628-
# GPU path
629-
LinearAlgebra.mul!(y.v, A.A, plan.gathered)
630-
end
621+
# Unified CPU/GPU path: plan.gathered has correct type after execute_plan!
622+
LinearAlgebra.mul!(y.v, A.A, plan.gathered)
631623
return y
632624
end
633625

@@ -653,16 +645,9 @@ function Base.:*(A::HPCMatrix{T,B}, x::HPCVector{T,B}) where {T,B<:HPCBackend}
653645
# Execute the plan to gather vector elements
654646
execute_plan!(plan, x)
655647

656-
# Check if CPU or GPU based on device type
657-
if A.backend.device isa DeviceCPU
658-
# CPU path
659-
y_v = Vector{T}(undef, local_rows)
660-
LinearAlgebra.mul!(y_v, A.A, plan.gathered_cpu)
661-
else
662-
# GPU path
663-
y_v = similar(A.A, local_rows)
664-
LinearAlgebra.mul!(y_v, A.A, plan.gathered)
665-
end
648+
# Unified CPU/GPU path: similar() preserves array type, plan.gathered has correct type
649+
y_v = similar(A.A, local_rows)
650+
LinearAlgebra.mul!(y_v, A.A, plan.gathered)
666651

667652
return HPCVector{T,B}(
668653
plan.result_partition_hash,
@@ -936,13 +921,9 @@ function execute_plan!(plan::DenseTransposePlan{T}, A::HPCMatrix{T,B}) where {T,
936921
plan.row_partition, plan.col_partition, size(result_AT), comm)
937922
end
938923

939-
# Convert result to match input array type (CPU or GPU)
940-
if !(A.backend.device isa DeviceCPU)
941-
# Input was GPU - convert result back to GPU
942-
result_AT_gpu = copyto!(similar(A.A, size(result_AT)), result_AT)
943-
return HPCMatrix{T,B}(plan.structural_hash, plan.row_partition, plan.col_partition, result_AT_gpu, A.backend)
944-
end
945-
return HPCMatrix{T,B}(plan.structural_hash, plan.row_partition, plan.col_partition, result_AT, A.backend)
924+
# Unified CPU/GPU path: _convert_array is no-op for CPU, copies for GPU
925+
result_A = _convert_array(result_AT, A.backend.device)
926+
return HPCMatrix{T,B}(plan.structural_hash, plan.row_partition, plan.col_partition, result_A, A.backend)
946927
end
947928

948929
"""
@@ -1249,19 +1230,14 @@ function Base.:*(At::Transpose{T,HPCMatrix{T,B}}, x::HPCVector{T,B}) where {T,B<
12491230
my_row_start = A.row_partition[rank+1]
12501231
my_row_end = A.row_partition[rank+2] - 1
12511232

1252-
if A.backend.device isa DeviceCPU
1253-
# CPU path
1254-
local_gathered = @view plan.gathered_cpu[my_row_start:my_row_end]
1255-
partial_result = transpose(A.A) * local_gathered
1256-
else
1257-
# GPU path - use GPU gathered directly
1258-
local_gathered = @view plan.gathered[my_row_start:my_row_end]
1259-
# For Metal, views may not work directly - copy to contiguous array
1260-
local_gathered_gpu = similar(A.A, length(local_gathered))
1261-
copyto!(local_gathered_gpu, Array(local_gathered))
1262-
partial_result_gpu = transpose(A.A) * local_gathered_gpu
1263-
partial_result = Array(partial_result_gpu) # Need CPU for Allreduce
1264-
end
1233+
# Unified CPU/GPU path:
1234+
# 1. Get slice and copy to contiguous array (fixes GPU view issues)
1235+
# 2. Compute on backend
1236+
# 3. Ensure CPU for Allreduce (no-op for CPU, copy for GPU)
1237+
local_gathered_slice = plan.gathered[my_row_start:my_row_end]
1238+
local_gathered_contiguous = copy(local_gathered_slice)
1239+
partial_result_backend = transpose(A.A) * local_gathered_contiguous
1240+
partial_result = _ensure_cpu(partial_result_backend)
12651241

12661242
# Allreduce to sum contributions from all ranks
12671243
full_result = comm_allreduce(comm, partial_result, +)
@@ -1271,8 +1247,8 @@ function Base.:*(At::Transpose{T,HPCMatrix{T,B}}, x::HPCVector{T,B}) where {T,B<
12711247
my_col_end = A.col_partition[rank+2] - 1
12721248
local_result_cpu = full_result[my_col_start:my_col_end]
12731249

1274-
# Copy to GPU if needed
1275-
local_result = (A.backend.device isa DeviceCPU) ? local_result_cpu : copyto!(similar(x.v, length(local_result_cpu)), local_result_cpu)
1250+
# Unified: _convert_array is no-op for CPU, copies for GPU
1251+
local_result = _convert_array(local_result_cpu, A.backend.device)
12761252

12771253
# Create result vector (partition is immutable, no need to copy)
12781254
y = HPCVector{T,B}(
@@ -1325,11 +1301,8 @@ function Base.:*(At::TransposedHPCMatrix{T,B}, Bmat::HPCMatrix{T,B}) where {T,B}
13251301
result_partition = columns[1].partition
13261302
local_m = result_partition[rank+2] - result_partition[rank+1]
13271303

1328-
# Build local matrix from column results (columns[k].v may be GPU array)
1329-
local_result = Matrix{T}(undef, local_m, n)
1330-
for k in 1:n
1331-
local_result[:, k] = Array(columns[k].v) # Ensure CPU for HPCMatrix_local
1332-
end
1304+
# Build local matrix from column results (preserves GPU array type)
1305+
local_result = reduce(hcat, [columns[k].v for k in 1:n])
13331306

13341307
return HPCMatrix_local(local_result, A.backend)
13351308
end
@@ -1530,9 +1503,9 @@ function Base.mapslices(f, A::HPCMatrix{T,B}; dims) where {T,B}
15301503
results = Vector{Any}(undef, n)
15311504
for j in 1:n
15321505
# Gather full column j from all ranks
1533-
# Convert to CPU for MPI communication (no-op for CPU arrays)
1506+
# Unified: _ensure_cpu is no-op for CPU, Array() for GPU
15341507
local_col = A.A[:, j]
1535-
local_col_cpu = local_col isa Vector ? local_col : Vector(local_col)
1508+
local_col_cpu = _ensure_cpu(local_col)
15361509
counts = Int32[A.row_partition[r+1] - A.row_partition[r] for r in 1:nranks]
15371510
full_col = Vector{T}(undef, m_global)
15381511
comm_allgatherv!(comm, local_col_cpu, MPI.VBuffer(full_col, counts))

src/mumps_factorization.jl

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -587,29 +587,12 @@ _get_mpi_comm(c::CommMPI) = c.comm
587587
_get_mpi_comm(::CommSerial) = error("Gatherv/Scatterv not supported for CommSerial in MUMPS solve")
588588

589589
# Helper to copy values into a HPCVector (handles GPU arrays)
590+
# Unified: _convert_array handles CPU→GPU conversion, copyto! handles the copy
590591
function _copy_to_vector!(x::HPCVector{T,B}, values::Vector) where {T,B}
591-
if x.backend.device isa DeviceCPU
592-
x.v .= values
593-
else
594-
# GPU array - need to copy through appropriate method
595-
copyto!(x.v, _convert_to_device_array(values, x.backend.device))
596-
end
592+
copyto!(x.v, _convert_array(values, x.backend.device))
597593
return x
598594
end
599595

600-
# Convert a CPU vector to a target device
601-
function _convert_to_device_array(v::Vector{T}, device::AbstractDevice) where T
602-
if device isa DeviceCPU
603-
return v
604-
else
605-
# For GPU devices, use extension-defined function
606-
return _array_to_device(v, device)
607-
end
608-
end
609-
610-
# Fallback for CPU device
611-
_array_to_device(v::Vector{T}, ::DeviceCPU) where T = v
612-
613596
"""
614597
Base.:\\(F::MUMPSFactorization, b::HPCVector)
615598

0 commit comments

Comments
 (0)