@@ -18,11 +18,13 @@ BLAS like interface, computing `C .= β*C + α*A*B`, but way faster than Base wo
Also see `fastdensesparsemul_threaded!` for a multi-threaded version using `Polyester.jl`.
"""
function fastdensesparsemul!(
    C::MatOrView{T}, A::MatOrView{T}, B::SparseMatrixCSC{T}, α::Number, β::Number
) where T
    # In-place update C .= β*C + α*A*B; returns the mutated C.
    #
    # Shapes are validated up front because the kernel below runs under
    # `@inbounds`: a mismatched argument must be rejected before any column of C
    # is indexed, otherwise the failure mode is an out-of-bounds write rather than
    # an exception.
    size(A, 2) == size(B, 1) || throw(DimensionMismatch(
        "A has $(size(A, 2)) columns but B has $(size(B, 1)) rows"))
    size(C) == (size(A, 1), size(B, 2)) || throw(DimensionMismatch(
        "C has size $(size(C)) but A*B has size $((size(A, 1), size(B, 2)))"))

    @fastmath @inbounds begin
        # Scale the whole of C once, so columns of B without stored entries are
        # still multiplied by β.
        C .*= β
        for j in axes(B, 2)
            # Accumulate α * A * B[:, j] into the j-th column of C.
            C[:, j] .+= A * (α .* B[:, j])
        end
    end
    return C
end
2729
2830"""
@@ -32,11 +34,13 @@ Threaded, BLAS like interface, computing `C .= β*C + α*A*B`, but way faster th
Also see `fastdensesparsemul!` for a single-threaded version.
"""
function fastdensesparsemul_threaded!(
    C::MatOrView{T}, A::MatOrView{T}, B::SparseMatrixCSC{T}, α::Number, β::Number
) where T
    # In-place update C .= β*C + α*A*B with the column loop run through `@batch`;
    # returns the mutated C.
    #
    # Shapes are validated before the parallel region: the kernel is wrapped in
    # `@inbounds`, so a mismatched argument must raise here instead of leading to
    # an out-of-bounds column access inside the loop.
    size(A, 2) == size(B, 1) || throw(DimensionMismatch(
        "A has $(size(A, 2)) columns but B has $(size(B, 1)) rows"))
    size(C) == (size(A, 1), size(B, 2)) || throw(DimensionMismatch(
        "C has size $(size(C)) but A*B has size $((size(A, 1), size(B, 2)))"))

    @fastmath @inbounds begin
        # The β scaling is applied to all of C before the loop starts.
        C .*= β
        # Each iteration only writes column j of C, so iterations do not overlap.
        @batch for j in axes(B, 2)
            C[:, j] .+= A * (α .* B[:, j])
        end
    end
    return C
end
4145
4246"""
@@ -47,9 +51,11 @@ Fast outer product when computing `C .= β*C + α * a*b'`, but way faster than B
Also see `fastdensesparsemul_outer_threaded!` for a multi-threaded version using `Polyester.jl`.
"""
function fastdensesparsemul_outer!(
    C::MatOrView{T}, a::VecOrView{T}, b::SparseVector{T}, α::Number, β::Number
) where T
    # In-place rank-one update C .= β*C + α * a*b'; returns the mutated C.
    #
    # Shapes are validated up front because the update below runs under
    # `@inbounds`: the stored indices of b are used directly as column indices
    # of C, so b must not be longer than C is wide.
    length(a) == size(C, 1) || throw(DimensionMismatch(
        "a has length $(length(a)) but C has $(size(C, 1)) rows"))
    length(b) == size(C, 2) || throw(DimensionMismatch(
        "b has length $(length(b)) but C has $(size(C, 2)) columns"))

    @fastmath @inbounds begin
        # Every entry of C is scaled by β, including columns where b has no
        # stored value.
        C .*= β
        # Only the columns matching stored entries of b receive a contribution;
        # the adjoint on the stored values supplies the conjugation in b'.
        C[:, nonzeroinds(b)] .+= a * (α .* nonzeros(b)')
    end
    return C
end
5460
5561"""
@@ -61,13 +67,15 @@ Threaded, fast outer product when computing `C .= β*C + α * a*b'`, but way fas
Also see `fastdensesparsemul_outer!` for a single-threaded version.
"""
function fastdensesparsemul_outer_threaded!(
    C::MatOrView{T}, a::VecOrView{T}, b::SparseVector{T}, α::Number, β::Number
) where T
    # In-place rank-one update C .= β*C + α * a*b' with the loop over stored
    # entries of b run through `@batch`; returns the mutated C.
    #
    # Shapes are validated before the parallel region: the kernel is wrapped in
    # `@inbounds` and uses the stored indices of b as column indices of C.
    length(a) == size(C, 1) || throw(DimensionMismatch(
        "a has length $(length(a)) but C has $(size(C, 1)) rows"))
    length(b) == size(C, 2) || throw(DimensionMismatch(
        "b has length $(length(b)) but C has $(size(C, 2)) columns"))

    inds = nonzeroinds(b)
    nzs = nonzeros(b)
    @fastmath @inbounds begin
        # Every entry of C is scaled by β before any contribution is added.
        C .*= β
        # Each iteration writes the single column inds[i] of C.
        @batch for i in axes(nzs, 1)
            # `nzs[i]'` conjugates the stored value so the result matches the
            # documented `a*b'`; for real element types it is a no-op.
            C[:, inds[i]] .+= (α * nzs[i]') .* a
        end
    end
    return C
end
7280
7381end # module ThreadedDenseSparseMul
0 commit comments