Skip to content

Commit 899201a

Browse files
committed
single CompressedLBFGSOperator commit (gathered in a single commit for easier rebase)
1 parent 04479cf commit 899201a

6 files changed

Lines changed: 299 additions & 2 deletions

File tree

Project.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ version = "2.12.0"
66
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
88
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
9+
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
910
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1011
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
1112

@@ -37,4 +38,4 @@ Metal = "1.9.1"
3738
Printf = "1.10"
3839
SparseArrays = "1.10"
3940
TimerOutputs = "0.5"
40-
julia = "1.10"
41+
julia = "1.10"

src/compressed_lbfgs.jl

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
#=
2+
Compressed LBFGS implementation from:
3+
REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS
4+
Richard H. Byrd, Jorge Nocedal and Robert B. Schnabel (1994)
5+
DOI: 10.1007/BF01582063
6+
7+
Implemented by Paul Raynaud (supervised by Dominique Orban)
8+
=#
9+
10+
using LinearAlgebra, LinearAlgebra.BLAS
11+
using Requires
12+
13+
export CompressedLBFGSOperator, CompressedLBFGSData
14+
# export default_matrix_type, default_vector_type
15+
16+
# Default storage types for the operator's internal matrices and vectors.
# They are replaced at load time when a functional GPU backend is detected
# (see the @init/@require block below).
function default_matrix_type(; T::DataType = Float64)
  return Matrix{T}
end

function default_vector_type(; T::DataType = Float64)
  return Vector{T}
end
18+
19+
# At package load time, if CUDA is present and functional, redefine the default
# storage types so the operator's buffers live on the GPU; otherwise keep the
# CPU defaults defined above.
@init begin
  @require CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" begin
    default_matrix_type(; T::DataType=Float64) = CUDA.functional() ? CUDA.CuMatrix{T, CUDA.Mem.DeviceBuffer} : Matrix{T}
    default_vector_type(; T::DataType=Float64) = CUDA.functional() ? CUDA.CuVector{T, CUDA.Mem.DeviceBuffer} : Vector{T}
  end
  # this scheme may be extended to other GPU backend modules
end
26+
27+
"""
    columnshift!(A; direction=-1, indicemax=size(A, 2))

Shift the columns of `A` in place by `direction` (a left shift for the default
`direction = -1`): for every `i` in `1-direction:indicemax`, column
`i + direction` receives the contents of column `i`.  Returns `A`.

Note: the default `indicemax` is the number of *columns* (`size(A, 2)`);
the previous default, `size(A)[1]`, was the number of rows and could index
columns out of bounds on non-square matrices.
"""
function columnshift!(A::AbstractMatrix{T}; direction::Int = -1, indicemax::Int = size(A, 2)) where T
  # plain loop instead of `map` used purely for side effects
  for i in 1-direction:indicemax
    view(A, :, i + direction) .= view(A, :, i)
  end
  return A
end
31+
32+
"""
    vectorshift!(v; direction=-1, indicemax=length(v))

Shift the entries of `v` in place by `direction` (a left shift for the default
`direction = -1`): `v[1:indicemax+direction]` receives
`v[1-direction:indicemax]`.  Returns `v`.
"""
function vectorshift!(v::AbstractVector{T}; direction::Int=-1, indicemax::Int=length(v)) where T
  # broadcast between overlapping views keeps this GPU-friendly (no scalar indexing)
  dst = view(v, 1:indicemax+direction)
  src = view(v, 1-direction:indicemax)
  dst .= src
  return v
end
36+
37+
"""
38+
CompressedLBFGSData{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
39+
40+
A LBFGS limited-memory operator.
41+
It represents a linear application Rⁿˣⁿ, considering at most `mem` BFGS updates.
42+
This implementation considers the bloc matrices reoresentation of the BFGS (forward) update.
43+
It follows the algorithm described in [REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS](https://link.springer.com/article/10.1007/BF01582063) from Richard H. Byrd, Jorge Nocedal and Robert B. Schnabel (1994).
44+
This operator considers several fields directly related to the bloc representation of the operator:
45+
- `mem`: the maximal memory of the operator;
46+
- `n`: the dimension of the linear application;
47+
- `k`: the current memory's size of the operator;
48+
- `α`: scalar for `B₀ = α I`;
49+
- `Sₖ`: retain the `k`-th last vectors `s` from the updates parametrized by `(s,y)`;
50+
- `Yₖ`: retain the `k`-th last vectors `y` from the updates parametrized by `(s,y)`;;
51+
- `Dₖ`: a diagonal matrix mandatory to perform the linear application and to form the matrix;
52+
- `Lₖ`: a lower diagonal mandatory to perform the linear application and to form the matrix.
53+
In addition to this structures which are circurlarly update when `k` reaches `mem`, we consider other intermediate data structures renew at each update:
54+
- `chol_matrix`: a matrix required to store a Cholesky factorization of a Rᵏˣᵏ matrix;
55+
- `intermediate_1`: a R²ᵏˣ²ᵏ matrix;
56+
- `intermediate_2`: a R²ᵏˣ²ᵏ matrix;
57+
- `inverse_intermediate_1`: a R²ᵏˣ²ᵏ matrix;
58+
- `inverse_intermediate_2`: a R²ᵏˣ²ᵏ matrix;
59+
- `intermediary_vector`: a vector ∈ Rᵏ to store intermediate solutions;
60+
- `sol`: a vector ∈ Rᵏ to store intermediate solutions;
61+
This implementation is designed to work either on CPU or GPU.
62+
"""
63+
mutable struct CompressedLBFGSData{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}, I <: Integer}
64+
mem::Int # memory of the operator
65+
n::I # vector size
66+
k::I # k ≤ mem, active memory of the operator
67+
α::T # B₀ = αI
68+
Sₖ::M # gather all sₖ₋ₘ : n * mem
69+
Yₖ::M # gather all yₖ₋ₘ : n * mem
70+
Dₖ::Diagonal{T,V} # mem * mem
71+
Lₖ::LowerTriangular{T,M} # mem * mem
72+
73+
chol_matrix::M # 2mem * 2mem
74+
intermediate_diagonal::Diagonal{T,V} # mem * mem
75+
intermediate_1::UpperTriangular{T,M} # 2mem * 2mem
76+
intermediate_2::LowerTriangular{T,M} # 2mem * 2mem
77+
inverse_intermediate_1::UpperTriangular{T,M} # 2mem * 2mem
78+
inverse_intermediate_2::LowerTriangular{T,M} # 2mem * 2mem
79+
intermediary_vector::V # 2mem
80+
sol::V # 2mem
81+
82+
nprod::I
83+
end
84+
85+
"""
86+
CompressedLBFGSData(n::Int; [T=Float64, mem=5], gpu:Bool)
87+
88+
A implementation of a LBFGS operator (forward), representing a `nxn` linear application.
89+
It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU.
90+
"""
91+
function CompressedLBFGSData(n::I; mem::I=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T)) where {I<:Integer}
92+
α = (T)(1)
93+
k = 0
94+
Sₖ = M(undef, n, mem)
95+
Yₖ = M(undef, n, mem)
96+
Dₖ = Diagonal(V(undef, mem))
97+
Lₖ = LowerTriangular(M(undef, mem, mem))
98+
Lₖ.data .= zero(T)
99+
100+
chol_matrix = M(undef, mem, mem)
101+
intermediate_diagonal = Diagonal(V(undef, mem))
102+
intermediate_1 = UpperTriangular(M(undef, 2*mem, 2*mem))
103+
intermediate_2 = LowerTriangular(M(undef, 2*mem, 2*mem))
104+
inverse_intermediate_1 = UpperTriangular(M(undef, 2*mem, 2*mem))
105+
inverse_intermediate_2 = LowerTriangular(M(undef, 2*mem, 2*mem))
106+
intermediary_vector = V(undef, 2*mem)
107+
sol = V(undef, 2*mem)
108+
109+
nprod = 0
110+
111+
return CompressedLBFGSData{T,M,V,I}(mem, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, nprod)
112+
end
113+
114+
# Operator wrapper around CompressedLBFGSData: carries the dimensions, the
# symmetry flags, a preallocated product buffer `Bv`, and the closures that
# apply the operator (the operator is symmetric, so the three products coincide).
mutable struct CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}, F, I <: Integer} <: AbstractQuasiNewtonOperator{T}
  nrow::I
  ncol::I
  symmetric::Bool
  hermitian::Bool
  Bv::V # preallocated buffer holding B * v during products
  data::CompressedLBFGSData{T,M,V}
  prod!::F # apply the operator to a vector
  tprod!::F # apply the transpose operator to a vector
  ctprod!::F # apply the transpose conjugate operator to a vector
end
125+
126+
"""
    CompressedLBFGSOperator(n::Int; mem=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T))

Build an `n × n` compressed LBFGS operator with memory `mem`, backed by a
`CompressedLBFGSData` and usable through the 5-argument `mul!` interface.
"""
function CompressedLBFGSOperator(n::I; mem::I=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T)) where {I <: Integer}
  nrow = n
  ncol = n
  symmetric = true
  hermitian = true
  Bv = V(undef, n)
  data = CompressedLBFGSData(n; mem, T, M, V)

  # 5-argument product: res .= α * (B * v) + β * res, using Bv as scratch.
  prod! = @closure (res, v, α, β) -> begin
    mul!(Bv, data, v)
    if β == zero(T)
      # avoid reading `res` when β == 0 (it may be uninitialized)
      res .= α .* Bv
    else
      res .= α .* Bv .+ β .* res
    end
  end

  F = typeof(prod!)

  # the operator is symmetric and real, so prod!, tprod! and ctprod! coincide
  return CompressedLBFGSOperator{T,M,V,F,I}(nrow, ncol, symmetric, hermitian, Bv, data, prod!, prod!, prod!)
end
147+
148+
# This operator implements the 5-argument product interface (res, v, α, β).
has_args5(::CompressedLBFGSOperator) = true
use_prod5!(::CompressedLBFGSOperator) = true
150+
151+
Base.push!(op::CompressedLBFGSOperator{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} = Base.push!(op.data, s, y)

"""
    push!(data::CompressedLBFGSData, s, y)

Store the update pair `(s, y)` in `data`.  While `k < mem` the pair is appended;
once the memory is full, `Sₖ`, `Yₖ`, `Dₖ` and `Lₖ` are shifted so the oldest
pair is dropped and `(s, y)` takes the last slot.  The factorized intermediate
structures are then refreshed via `precompile_iterated_structure!`.
"""
function Base.push!(data::CompressedLBFGSData{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}}
  if data.k < data.mem # still some place in the structures
    data.k += 1
    view(data.Sₖ, :, data.k) .= s
    view(data.Yₖ, :, data.k) .= y
    view(data.Dₖ.diag, data.k) .= dot(s, y)
    # fill the new last row of Lₖ: Lₖ[k, j] = yⱼᵀ sₖ for j < k
    mul!(view(data.Lₖ.data, data.k, 1:data.k-1), transpose(view(data.Yₖ, :, 1:data.k-1)), view(data.Sₖ, :, data.k) )
  else # k == mem: update the intermediary structures circularly
    columnshift!(data.Sₖ; indicemax=data.k)
    columnshift!(data.Yₖ; indicemax=data.k)
    # data.Dₖ .= circshift(data.Dₖ, (-1, -1))
    vectorshift!(data.Dₖ.diag; indicemax=data.k)
    view(data.Sₖ, :, data.k) .= s
    view(data.Yₖ, :, data.k) .= y
    view(data.Dₖ.diag, data.k) .= dot(s, y)

    # shift the strict lower triangle of Lₖ one position up-and-left,
    # discarding the row/column of the dropped pair
    map(i-> view(data.Lₖ, i:data.mem-1, i-1) .= view(data.Lₖ, i+1:data.mem, i), 2:data.mem)
    # recompute the last row of Lₖ against the shifted Yₖ
    mul!(view(data.Lₖ.data, data.k, 1:data.k-1), transpose(view(data.Yₖ, :, 1:data.k-1)), view(data.Sₖ, :, data.k) )
  end

  # step 4 and 6
  precompile_iterated_structure!(data)

  # secant equation fails if uncommented
  # data.α = dot(y,s)/dot(s,s)
  return data
end
179+
180+
# Algorithm 3.2 (p15)
# Theorem 2.3 (p6)
"""
    Matrix(op::CompressedLBFGSOperator)
    Matrix(data::CompressedLBFGSData)

Form the dense `n × n` matrix represented by the compressed LBFGS operator,
following Theorem 2.3: `B = B₀ - [B₀Sₖ Yₖ] Cₖ⁻¹ [B₀Sₖ Yₖ]ᵀ` with
`Cₖ = [α SₖᵀSₖ  Lₖ; Lₖᵀ  -Dₖ]` and `B₀ = α I`.
"""
Base.Matrix(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} = Base.Matrix(op.data)
function Base.Matrix(data::CompressedLBFGSData{T,M,V}) where {T,M,V}
  # B₀ = α I.  The buffer is allocated uninitialized, so it must be zeroed
  # before writing the diagonal (previously the off-diagonal entries were
  # left undefined, producing garbage in the result).
  B₀ = M(undef, data.n, data.n)
  B₀ .= zero(T)
  for i in 1:data.n
    B₀[i, i] = data.α
  end

  k = data.k
  # BSY = [B₀ Sₖ  Yₖ]
  BSY = M(undef, data.n, 2 * k)
  (k > 0) && (BSY[:, 1:k] = B₀ * data.Sₖ[:, 1:k])
  (k > 0) && (BSY[:, k+1:2*k] = data.Yₖ[:, 1:k])

  # middle matrix Cₖ = [α SᵀS  L; Lᵀ  -D].  `_C` is uninitialized, so every
  # block must be assigned (the previous `.-=` on the (2,2) block read
  # uninitialized memory).  The α factor matches inverse_cholesky.
  _C = M(undef, 2 * k, 2 * k)
  (k > 0) && (_C[1:k, 1:k] .= data.α .* (transpose(data.Sₖ[:, 1:k]) * data.Sₖ[:, 1:k]))
  (k > 0) && (_C[1:k, k+1:2*k] .= data.Lₖ[1:k, 1:k])
  (k > 0) && (_C[k+1:2*k, 1:k] .= transpose(data.Lₖ[1:k, 1:k]))
  (k > 0) && (_C[k+1:2*k, k+1:2*k] .= .-(data.Dₖ[1:k, 1:k]))
  C = inv(_C)

  Bₖ = B₀ .- BSY * C * transpose(BSY)
  return Bₖ
end
200+
201+
# Algorithm 3.2 (p15)
# step 4, Jₖ is computed only if needed
"""
    inverse_cholesky(data::CompressedLBFGSData)

Compute the lower-triangular Cholesky factor `Jₖ` with
`Jₖ Jₖᵀ = α SₖᵀSₖ + Lₖ Dₖ⁻¹ Lₖᵀ`, factorized in place in `data.chol_matrix`.
Returns `Jₖ` as a lazy transpose of the stored upper factor.
"""
function inverse_cholesky(data::CompressedLBFGSData{T,M,V}) where {T,M,V}
  # intermediate_diagonal ← Dₖ⁻¹ (element-wise inverse of the diagonal)
  view(data.intermediate_diagonal.diag, 1:data.k) .= inv.(view(data.Dₖ.diag, 1:data.k))

  # chol_matrix ← Lₖ (Dₖ⁻¹ Lₖᵀ), using inverse_intermediate_1 as scratch
  mul!(view(data.inverse_intermediate_1, 1:data.k, 1:data.k), view(data.intermediate_diagonal, 1:data.k, 1:data.k), transpose(view(data.Lₖ, 1:data.k, 1:data.k)))
  mul!(view(data.chol_matrix, 1:data.k, 1:data.k), view(data.Lₖ, 1:data.k, 1:data.k), view(data.inverse_intermediate_1, 1:data.k, 1:data.k))

  # chol_matrix ← α SₖᵀSₖ + chol_matrix (5-arg mul! accumulates in place)
  mul!(view(data.chol_matrix, 1:data.k, 1:data.k), transpose(view(data.Sₖ, :, 1:data.k)), view(data.Sₖ, :, 1:data.k), data.α, (T)(1))

  # in-place Cholesky of the leading k × k block; the upper factor is stored,
  # and Jₖ is its (lazy) transpose
  cholesky!(Symmetric(view(data.chol_matrix, 1:data.k, 1:data.k)))
  Jₖ = transpose(UpperTriangular(view(data.chol_matrix, 1:data.k, 1:data.k)))
  return Jₖ
end
215+
216+
# step 6, must be improved
"""
    precompile_iterated_structure!(data::CompressedLBFGSData)

Rebuild the 2k × 2k triangular factors used by `mul!` (step 6 of Algorithm 3.2):
`intermediate_1` (upper) and `intermediate_2` (lower) are assembled from `Jₖ`,
`Dₖ^(±1/2)` and `Lₖ`, then their inverses are stored in
`inverse_intermediate_1` / `inverse_intermediate_2`.
"""
function precompile_iterated_structure!(data::CompressedLBFGSData)
  Jₖ = inverse_cholesky(data)

  # constant update: zero off-triangle blocks, place Jₖᵀ / Jₖ in the (2,2) blocks
  view(data.intermediate_1, data.k+1:2*data.k, 1:data.k) .= 0
  view(data.intermediate_2, 1:data.k, data.k+1:2*data.k) .= 0
  view(data.intermediate_1, data.k+1:2*data.k, data.k+1:2*data.k) .= transpose(Jₖ)
  view(data.intermediate_2, data.k+1:2*data.k, data.k+1:2*data.k) .= Jₖ

  # updates related to D^(1/2): (1,1) blocks are ∓ Dₖ^(1/2)
  view(data.intermediate_diagonal.diag, 1:data.k) .= sqrt.(view(data.Dₖ.diag, 1:data.k))
  view(data.intermediate_1, 1:data.k,1:data.k) .= .- view(data.intermediate_diagonal, 1:data.k, 1:data.k)
  view(data.intermediate_2, 1:data.k, 1:data.k) .= view(data.intermediate_diagonal, 1:data.k, 1:data.k)

  # updates related to D^(-1/2): off-diagonal blocks Dₖ^(-1/2) Lₖᵀ and -Lₖ Dₖ^(-1/2)
  view(data.intermediate_diagonal.diag, 1:data.k) .= (x -> 1/sqrt(x)).(view(data.Dₖ.diag, 1:data.k))
  mul!(view(data.intermediate_1, 1:data.k,data.k+1:2*data.k), view(data.intermediate_diagonal, 1:data.k, 1:data.k), transpose(view(data.Lₖ, 1:data.k, 1:data.k)))
  mul!(view(data.intermediate_2, data.k+1:2*data.k, 1:data.k), view(data.Lₖ, 1:data.k, 1:data.k), view(data.intermediate_diagonal, 1:data.k, 1:data.k))
  view(data.intermediate_2, data.k+1:2*data.k, 1:data.k) .= view(data.intermediate_2, data.k+1:2*data.k, 1:data.k) .* -1

  # store the inverses used at every product (the `inv` calls allocate;
  # NOTE(review): triangular solves would avoid this — candidate improvement)
  view(data.inverse_intermediate_1, 1:2*data.k, 1:2*data.k) .= inv(data.intermediate_1[1:2*data.k, 1:2*data.k])
  view(data.inverse_intermediate_2, 1:2*data.k, 1:2*data.k) .= inv(data.intermediate_2[1:2*data.k, 1:2*data.k])
end
240+
241+
# Algorithm 3.2 (p15)
"""
    mul!(Bv, op::CompressedLBFGSOperator, v)
    mul!(Bv, data::CompressedLBFGSData, v)

Compute `Bv = B * v` for the compressed LBFGS operator, using the
precomputed triangular factors (steps 5 and 7 of Algorithm 3.2).
"""
LinearAlgebra.mul!(Bv::V, op::CompressedLBFGSOperator{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} = LinearAlgebra.mul!(Bv, op.data, v)
function LinearAlgebra.mul!(Bv::V, data::CompressedLBFGSData{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}}
  data.nprod += 1
  # step 1-4 and 6 mainly done by Base.push!
  # step 5: sol ← [Yₖᵀ v; α Sₖᵀ v]
  mul!(view(data.sol, 1:data.k), transpose(view(data.Yₖ, :, 1:data.k)), v)
  mul!(view(data.sol, data.k+1:2*data.k), transpose(view(data.Sₖ, :, 1:data.k)), v)
  # scal!(data.α, view(data.sol, data.k+1:2*data.k)) # more allocation, slower
  view(data.sol, data.k+1:2*data.k) .*= data.α

  # apply the two triangular inverses: sol ← M₁⁻¹ (M₂⁻¹ sol)
  mul!(view(data.intermediary_vector, 1:2*data.k), view(data.inverse_intermediate_2, 1:2*data.k, 1:2*data.k), view(data.sol, 1:2*data.k))
  mul!(view(data.sol, 1:2*data.k), view(data.inverse_intermediate_1, 1:2*data.k, 1:2*data.k), view(data.intermediary_vector, 1:2*data.k))

  # step 7: Bv ← α v - Yₖ sol₁ - α Sₖ sol₂ (assembled with two mul! and a broadcast)
  mul!(Bv, view(data.Yₖ, :, 1:data.k), view(data.sol, 1:data.k))
  mul!(Bv, view(data.Sₖ, :, 1:data.k), view(data.sol, data.k+1:2*data.k), - data.α, (T)(-1))
  Bv .+= data.α .* v
  return Bv
end
261+
262+
"""
263+
reset!(op)
264+
265+
Resets the CompressedLBFGS data of the given operator.
266+
"""
267+
function reset!(op::CompressedLBFGSOperator)
268+
op.data.nprod = 0
269+
return op
270+
end

src/qn.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ import LinearAlgebra.diag
55

66
include("lbfgs.jl")
77
include("lsr1.jl")
8+
9+
include("compressed_lbfgs.jl")

test/gpu/nvidia.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,5 @@ using LinearOperators, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER
2121

2222
@testset "Nvidia S kwarg" test_S_kwarg(arrayType = CuArray)
2323
end
24+
25+
include("../test_clbfgs.jl")

test/runtests.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ include("test_linop_allocs.jl")
88
include("test_adjtrans.jl")
99
include("test_cat.jl")
1010
include("test_lbfgs.jl")
11+
include("test_clbfgs.jl")
1112
include("test_lsr1.jl")
1213
include("test_kron.jl")
1314
include("test_callable.jl")
@@ -21,4 +22,4 @@ include("gpu/test_S_kwarg.jl")
2122
include("gpu/jlarrays.jl")
2223
if Sys.isapple() && occursin("arm64", Sys.MACHINE)
2324
include("gpu/metal.jl")
24-
end
25+
end

test/test_clbfgs.jl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
@testset "CompressedLBFGSOperator operator" begin
2+
iter=50
3+
n=100
4+
n=5
5+
types = [Float32, Float64]
6+
for T in types
7+
lbfgs = CompressedLBFGSOperator(n; T) # mem=5
8+
V = LinearOperators.default_vector_type(;T)
9+
Bv = V(rand(T, n))
10+
s = V(rand(T, n))
11+
mul!(Bv, lbfgs, s) # warm-up
12+
for i in 1:iter
13+
s = V(rand(T, n))
14+
y = V(rand(T, n))
15+
push!(lbfgs, s, y)
16+
allocs = @allocated mul!(Bv, lbfgs, s)
17+
@test allocs == 0
18+
@test Bv y
19+
end
20+
end
21+
end

0 commit comments

Comments
 (0)