@@ -4,42 +4,62 @@ import SparseArrays
44import SparseArrays: SparseMatrixCSC, SparseVector, nonzeroinds, nonzeros, findnz
55import Polyester: @batch
66
7+ export fastdensesparse!, fastdensesparse_threaded!
8+ export fastdensesparse_outer!, fastdensesparse_outer_threaded!
9+
10+ const VecOrView{T} = Union{Vector{T}, SubArray{T, 1 , Matrix{T}}}
11+ const MatOrView{T} = Union{Matrix{T}, SubArray{T, 2 , Matrix{T}}}
12+
713"""
814 fastdensesparse!(C, A, B, α, β)
915
10- BLAS like interface, computing `C .= β*C + α*A*B`, but accelerated with multi-threading using `Polyester.jl`.
16+ BLAS like interface, computing `C .= β*C + α*A*B`, but way faster than Base would.
17+
18+ Also see `fastdensesparse_threaded!` for a multi-threaded version using `Polyester.jl`.
1119"""
12- function fastdensesparse! (C:: Matrix {T} , A:: Matrix {T} , B:: SparseMatrixCSC{T} , α:: Number , β:: Number ) where T
20+ function fastdensesparse! (C:: MatOrView {T} , A:: MatOrView {T} , B:: SparseMatrixCSC{T} , α:: Number , β:: Number ) where T
1321 for j in axes (B, 2 )
1422 C[:, j] .*= β
1523 C[:, j] .+ = A * (α.* B[:, j])
1624 end
1725 return C
1826end
19- function fastdensesparse_threaded! (C:: Matrix{T} , A:: Matrix{T} , B:: SparseMatrixCSC{T} , α:: Number , β:: Number ) where T
27+
28+ """
29+ fastdensesparse!(C, A, B, α, β)
30+
31+ Threaded, BLAS like interface, computing `C .= β*C + α*A*B`, but way faster than Base would.
32+ Also see `fastdensesparse!` for a single-threaded version.
33+ """
34+ function fastdensesparse_threaded! (C:: MatOrView{T} , A:: MatOrView{T} , B:: SparseMatrixCSC{T} , α:: Number , β:: Number ) where T
2035 @batch for j in axes (B, 2 )
2136 C[:, j] .*= β
2237 C[:, j] .+ = A * (α.* B[:, j])
2338 end
2439 return C
2540end
2641
27- const VecOrView{T} = Union{Vector{T}, SubArray{T, 1 , Matrix{T}}}
28- # this one is slightly slower than `fastdensesparse_outer!`, probably because of extra allocations.
29- function _fastdensesparse_outer! (C:: Matrix{T} , A:: VecOrView{T} , b:: SparseVector{T} , α:: Number , β:: Number ) where T
30- for (j, X_val) in zip (findnz (b)... )
31- C[:, j] .*= β
32- C[:, j] .+ = (α* X_val) .* A # this compiles to something similar to axpy!, i.e. no allocations. Notice we need the dot also for the scalar mul.
33- end
34- return C
35- end
42+ """
43+ fastdensesparse_outer!(C, a, b, α, β)
3644
37- function fastdensesparse_outer! (C:: Matrix{T} , a:: VecOrView{T} , b:: SparseVector{T} , α:: Number , β:: Number ) where T
45+ Fast outer product when computing `C .= β*C + α * a*b'`, but way faster than Base would.
46+ - `a` is a dense vector (or view), `b` is a sparse vector, `C` is a dense matrix (or view).
47+ Also see `fastdensesparse_outer_threaded!` for a multi-threaded version using `Polyester.jl`.
48+ """
49+ function fastdensesparse_outer! (C:: MatOrView{T} , a:: VecOrView{T} , b:: SparseVector{T} , α:: Number , β:: Number ) where T
3850 C[:, nonzeroinds (b)] .+ = a * nonzeros (b)'
3951 return C
4052end
4153
42- function fastdensesparse_outer_threaded! (C:: Matrix{T} , a:: VecOrView{T} , b:: SparseVector{T} , α:: Number , β:: Number ) where T
54+ """
55+ fastdensesparse_outer_threaded!(C, a, b, α, β)
56+
57+ Threaded, fast outer product when computing `C .= β*C + α * a*b'`, but way faster than Base would, using `Polyester.jl`.
58+ - `a` is a dense vector (or view), `b` is a sparse vector, `C` is a dense matrix (or view).
59+
60+ Also see `fastdensesparse_outer!` for a single-threaded version.
61+ """
62+ function fastdensesparse_outer_threaded! (C:: MatOrView{T} , a:: VecOrView{T} , b:: SparseVector{T} , α:: Number , β:: Number ) where T
4363 inds = nonzeroinds (b)
4464 nzs = nonzeros (b)
4565 @batch for i in axes (nzs, 1 )
@@ -49,6 +69,4 @@ function fastdensesparse_outer_threaded!(C::Matrix{T}, a::VecOrView{T}, b::Spars
4969 return C
5070end
5171
52-
53-
5472end # module ThreadedDenseSparseMul
0 commit comments