Skip to content

Commit 1f9209d

Browse files
committed
Bugfix, and Try to get rid of some extra allocations.
1 parent 917ac43 commit 1f9209d

1 file changed

Lines changed: 25 additions & 17 deletions

File tree

src/ThreadedDenseSparseMul.jl

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ BLAS like interface, computing `C .= β*C + α*A*B`, but way faster than Base wo
1818
Also see `fastdensesparsemul_threaded!` for a multi-threaded version using `Polyester.jl`.
1919
"""
2020
function fastdensesparsemul!(C::MatOrView{T}, A::MatOrView{T}, B::SparseMatrixCSC{T}, α::Number, β::Number) where T
21-
for j in axes(B, 2)
22-
C[:, j] .*= β
23-
C[:, j] .+= A * (α.*B[:, j])
21+
@fastmath @inbounds begin
22+
C .*= β
23+
for j in axes(B, 2)
24+
C[:, j] .+= A * (α.*B[:, j])
25+
end
26+
return C
2427
end
25-
return C
2628
end
2729

2830
"""
@@ -32,11 +34,13 @@ Threaded, BLAS like interface, computing `C .= β*C + α*A*B`, but way faster th
3234
Also see `fastdensesparsemul!` for a single-threaded version.
3335
"""
3436
function fastdensesparsemul_threaded!(C::MatOrView{T}, A::MatOrView{T}, B::SparseMatrixCSC{T}, α::Number, β::Number) where T
35-
@batch for j in axes(B, 2)
36-
C[:, j] .*= β
37-
C[:, j] .+= A * (α.*B[:, j])
37+
@fastmath @inbounds begin
38+
C .*= β
39+
@batch for j in axes(B, 2)
40+
C[:, j] .+= A * (α.*B[:, j])
41+
end
42+
return C
3843
end
39-
return C
4044
end
4145

4246
"""
@@ -47,9 +51,11 @@ Fast outer product when computing `C .= β*C + α * a*b'`, but way faster than B
4751
Also see `fastdensesparsemul_outer_threaded!` for a multi-threaded version using `Polyester.jl`.
4852
"""
4953
function fastdensesparsemul_outer!(C::MatOrView{T}, a::VecOrView{T}, b::SparseVector{T}, α::Number, β::Number) where T
50-
C[:, nonzeroinds(b)] .*= β
51-
C[:, nonzeroinds(b)] .+= a * (α.*nonzeros(b))'
52-
return C
54+
@fastmath @inbounds begin
55+
C .*= β
56+
C[:, nonzeroinds(b)] .+= a * (α.*nonzeros(b)')
57+
return C
58+
end
5359
end
5460

5561
"""
@@ -61,13 +67,15 @@ Threaded, fast outer product when computing `C .= β*C + α * a*b'`, but way fas
6167
Also see `fastdensesparsemul_outer!` for a single-threaded version.
6268
"""
6369
function fastdensesparsemul_outer_threaded!(C::MatOrView{T}, a::VecOrView{T}, b::SparseVector{T}, α::Number, β::Number) where T
64-
inds = nonzeroinds(b)
65-
nzs = nonzeros(b)
66-
@batch for i in axes(nzs, 1)
67-
C[:, inds[i]] .*= β
68-
C[:, inds[i]] .+= (α.*nzs[i]).*a
70+
@fastmath @inbounds begin
71+
inds = nonzeroinds(b)
72+
nzs = nonzeros(b)
73+
C .*= β
74+
@batch for i in axes(nzs, 1)
75+
C[:, inds[i]] .+= (α*nzs[i]).*a
76+
end
77+
return C
6978
end
70-
return C
7179
end
7280

7381
end # module ThreadedDenseSparseMul

0 commit comments

Comments
 (0)