@@ -618,16 +618,8 @@ function LinearAlgebra.mul!(y::HPCVector{T,B}, A::HPCMatrix{T,B}, x::HPCVector{T
     plan = get_dense_vector_plan(A, x)
     execute_plan!(plan, x)

-    # Check if CPU or GPU based on device type
-    if A.backend.device isa DeviceCPU
-        # CPU path
-        y_local_cpu = Vector{T}(undef, length(y.v))
-        LinearAlgebra.mul!(y_local_cpu, A.A, plan.gathered_cpu)
-        copyto!(y.v, y_local_cpu)
-    else
-        # GPU path
-        LinearAlgebra.mul!(y.v, A.A, plan.gathered)
-    end
+    # Unified CPU/GPU path: plan.gathered has correct type after execute_plan!
+    LinearAlgebra.mul!(y.v, A.A, plan.gathered)
     return y
 end

@@ -653,16 +645,9 @@ function Base.:*(A::HPCMatrix{T,B}, x::HPCVector{T,B}) where {T,B<:HPCBackend}
     # Execute the plan to gather vector elements
     execute_plan!(plan, x)

-    # Check if CPU or GPU based on device type
-    if A.backend.device isa DeviceCPU
-        # CPU path
-        y_v = Vector{T}(undef, local_rows)
-        LinearAlgebra.mul!(y_v, A.A, plan.gathered_cpu)
-    else
-        # GPU path
-        y_v = similar(A.A, local_rows)
-        LinearAlgebra.mul!(y_v, A.A, plan.gathered)
-    end
+    # Unified CPU/GPU path: similar() preserves array type, plan.gathered has correct type
+    y_v = similar(A.A, local_rows)
+    LinearAlgebra.mul!(y_v, A.A, plan.gathered)

     return HPCVector{T,B}(
         plan.result_partition_hash,
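Both hunks above lean on the same fact: `similar` and `LinearAlgebra.mul!` are generic over the concrete array type, so once the gathered buffer lives on the same device as `A.A` there is nothing left to branch on. A minimal CPU-only sketch of that pattern, with plain `Matrix`/`Vector` standing in for the backend arrays (the names below are illustrative, not part of the package):

    using LinearAlgebra

    A_local  = rand(4, 6)   # stand-in for A.A, the local block of the distributed matrix
    gathered = rand(6)      # stand-in for plan.gathered, already on the same device as A_local

    # similar() allocates an output of the same array family as A_local,
    # so these two lines would run unchanged on a GPU backend.
    y_local = similar(A_local, size(A_local, 1))
    LinearAlgebra.mul!(y_local, A_local, gathered)

    @assert y_local ≈ A_local * gathered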
@@ -936,13 +921,9 @@ function execute_plan!(plan::DenseTransposePlan{T}, A::HPCMatrix{T,B}) where {T,
             plan.row_partition, plan.col_partition, size(result_AT), comm)
     end

-    # Convert result to match input array type (CPU or GPU)
-    if !(A.backend.device isa DeviceCPU)
-        # Input was GPU - convert result back to GPU
-        result_AT_gpu = copyto!(similar(A.A, size(result_AT)), result_AT)
-        return HPCMatrix{T,B}(plan.structural_hash, plan.row_partition, plan.col_partition, result_AT_gpu, A.backend)
-    end
-    return HPCMatrix{T,B}(plan.structural_hash, plan.row_partition, plan.col_partition, result_AT, A.backend)
+    # Unified CPU/GPU path: _convert_array is no-op for CPU, copies for GPU
+    result_A = _convert_array(result_AT, A.backend.device)
+    return HPCMatrix{T,B}(plan.structural_hash, plan.row_partition, plan.col_partition, result_A, A.backend)
 end

 """
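The `_convert_array` helper used here and in the later hunks is not shown in this diff. Going only by its comments ("no-op for CPU, copies for GPU"), a plausible shape is a pair of methods dispatching on the device tag; the `DeviceGPU` type and the constructor-based copy below are assumptions for illustration, not the package's actual code:

    # Hypothetical device tags mirroring the `A.backend.device isa DeviceCPU`
    # checks that the removed code performed.
    struct DeviceCPU end
    struct DeviceGPU{AT} end   # AT: the device array type, e.g. a Metal or CUDA array

    # CPU: the data is already a host Array, hand it back untouched.
    _convert_array(x::AbstractArray, ::DeviceCPU) = x

    # GPU: allocate an array of the device's array type and copy the host data over.
    _convert_array(x::AbstractArray, ::DeviceGPU{AT}) where {AT} =
        copyto!(AT(undef, size(x)...), x)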
@@ -1249,19 +1230,14 @@ function Base.:*(At::Transpose{T,HPCMatrix{T,B}}, x::HPCVector{T,B}) where {T,B<
     my_row_start = A.row_partition[rank+1]
     my_row_end = A.row_partition[rank+2] - 1

-    if A.backend.device isa DeviceCPU
-        # CPU path
-        local_gathered = @view plan.gathered_cpu[my_row_start:my_row_end]
-        partial_result = transpose(A.A) * local_gathered
-    else
-        # GPU path - use GPU gathered directly
-        local_gathered = @view plan.gathered[my_row_start:my_row_end]
-        # For Metal, views may not work directly - copy to contiguous array
-        local_gathered_gpu = similar(A.A, length(local_gathered))
-        copyto!(local_gathered_gpu, Array(local_gathered))
-        partial_result_gpu = transpose(A.A) * local_gathered_gpu
-        partial_result = Array(partial_result_gpu)  # Need CPU for Allreduce
-    end
+    # Unified CPU/GPU path:
+    # 1. Get slice and copy to contiguous array (fixes GPU view issues)
+    # 2. Compute on backend
+    # 3. Ensure CPU for Allreduce (no-op for CPU, copy for GPU)
+    local_gathered_slice = plan.gathered[my_row_start:my_row_end]
+    local_gathered_contiguous = copy(local_gathered_slice)
+    partial_result_backend = transpose(A.A) * local_gathered_contiguous
+    partial_result = _ensure_cpu(partial_result_backend)

     # Allreduce to sum contributions from all ranks
     full_result = comm_allreduce(comm, partial_result, +)
@@ -1271,8 +1247,8 @@ function Base.:*(At::Transpose{T,HPCMatrix{T,B}}, x::HPCVector{T,B}) where {T,B<
     my_col_end = A.col_partition[rank+2] - 1
     local_result_cpu = full_result[my_col_start:my_col_end]

-    # Copy to GPU if needed
-    local_result = (A.backend.device isa DeviceCPU) ? local_result_cpu : copyto!(similar(x.v, length(local_result_cpu)), local_result_cpu)
+    # Unified: _convert_array is no-op for CPU, copies for GPU
+    local_result = _convert_array(local_result_cpu, A.backend.device)

     # Create result vector (partition is immutable, no need to copy)
     y = HPCVector{T,B}(
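Similarly, `_ensure_cpu` is only described by its comments in this diff ("no-op for CPU, Array() for GPU"). It is needed because `comm_allreduce` ultimately hands the buffer to MPI, which expects host memory unless the MPI build is device-aware. A minimal sketch under that reading:

    # Host arrays pass through untouched.
    _ensure_cpu(x::Array) = x

    # Any other array type (Metal, CUDA, ...) is copied to host memory
    # before being handed to MPI collectives such as Allreduce.
    _ensure_cpu(x::AbstractArray) = Array(x)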
@@ -1325,11 +1301,8 @@ function Base.:*(At::TransposedHPCMatrix{T,B}, Bmat::HPCMatrix{T,B}) where {T,B}
     result_partition = columns[1].partition
     local_m = result_partition[rank+2] - result_partition[rank+1]

-    # Build local matrix from column results (columns[k].v may be GPU array)
-    local_result = Matrix{T}(undef, local_m, n)
-    for k in 1:n
-        local_result[:, k] = Array(columns[k].v)  # Ensure CPU for HPCMatrix_local
-    end
+    # Build local matrix from column results (preserves GPU array type)
+    local_result = reduce(hcat, [columns[k].v for k in 1:n])

     return HPCMatrix_local(local_result, A.backend)
 end
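The `reduce(hcat, ...)` rewrite works because concatenating vectors of one array family produces a matrix of the same family, so GPU columns never round-trip through `Array`. A quick CPU illustration of the type behaviour (the same holds for device vectors):

    cols = [rand(Float32, 5) for _ in 1:3]   # stand-ins for columns[k].v
    local_result = reduce(hcat, cols)

    @assert local_result isa Matrix{Float32}
    @assert size(local_result) == (5, 3)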
@@ -1530,9 +1503,9 @@ function Base.mapslices(f, A::HPCMatrix{T,B}; dims) where {T,B}
     results = Vector{Any}(undef, n)
     for j in 1:n
         # Gather full column j from all ranks
-        # Convert to CPU for MPI communication (no-op for CPU arrays)
+        # Unified: _ensure_cpu is no-op for CPU, Array() for GPU
         local_col = A.A[:, j]
-        local_col_cpu = local_col isa Vector ? local_col : Vector(local_col)
+        local_col_cpu = _ensure_cpu(local_col)
         counts = Int32[A.row_partition[r+1] - A.row_partition[r] for r in 1:nranks]
         full_col = Vector{T}(undef, m_global)
         comm_allgatherv!(comm, local_col_cpu, MPI.VBuffer(full_col, counts))
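For reference, the gather in this last hunk is the standard MPI.jl variable-count pattern; `comm_allgatherv!` is presumably a thin wrapper around `MPI.Allgatherv!` (the wrapper itself is not shown in this diff, so treat that as an assumption). A standalone sketch of the underlying call:

    using MPI
    MPI.Init()

    comm   = MPI.COMM_WORLD
    rank   = MPI.Comm_rank(comm)
    nranks = MPI.Comm_size(comm)

    # Each rank owns a different number of rows of a global column.
    counts    = Int32[r for r in 1:nranks]            # rank r (0-based) owns r+1 elements
    local_col = fill(Float64(rank), counts[rank+1])   # this rank's slice, already on the CPU

    # Gather every rank's slice into one full column, sized from the counts.
    full_col = Vector{Float64}(undef, sum(counts))
    MPI.Allgatherv!(local_col, MPI.VBuffer(full_col, counts), comm)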