Skip to content

Commit 09af0b8

Browse files
bug fix with CUDA and gradient calculations.
1 parent ac80033 commit 09af0b8

3 files changed

Lines changed: 44 additions & 9 deletions

File tree

examples/gauss_fit.jl

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ many_sig = true
1414

1515
use_cuda = false
1616
DType = Float32
17-
N = 10_000
17+
# N = 10_000
18+
N = 1000
1819
hp_off = many_off ? 2 .*rand(DType, (1, N)) : 0
1920
hp_sig = many_sig ? zeros(DType, (1, N)) : 0
2021
hp_int = many_int ? 1 .+ rand(DType, (1, N)) : 1
@@ -57,7 +58,7 @@ myfg! = get_fg!(dat, gaussian_raw, length(sz); loss=loss_poisson_pos);
5758
# sigma = CuArray(sigma)
5859
# end
5960
# startvals = DType.(ComponentVector(;bg=bg, intensity=intensity, off = soff, args = sigma))
60-
opt = Optim.Options(iterations = 1500); #
61+
opt = Optim.Options(iterations = 150); #
6162

6263
if (false)
6364
G = copy(startvals)
@@ -86,6 +87,30 @@ reso.minimum #
8687
success = sum(abs.(collect(startvals.off) .- vec_true.off), dims=1) .< 0.5
8788
success = success .&& sum(abs.(collect(reso.minimizer.off) .- vec_true.off), dims=1) .< 0.5
8889
sum(.!success)
90+
91+
# Try the same by calling optimizations in parallel (seems to work better!):
92+
if false
93+
szz = size(dat)[end]
94+
# convert the startvals into a vector of individual startvals
95+
dat_a = Array(dat)
96+
svv = [DType.(ComponentVector(gauss_start(dat_a[:,:,n:n], 0.2, length(sz)))) for n in 1:szz];
97+
if (use_cuda)
98+
svv = [ComponentVector(;bg=CuArray(svv[n].bg), intensity=CuArray(svv[n].intensity),
99+
off = CuArray(svv[n].off), args = CuArray(svv[n].args)) for n in 1:szz]
100+
end
101+
# svb.args = svb.args .* 1.2f0
102+
myfg!v = [get_fg!(dat[:,:,n:n], gaussian_raw, length(sz); loss=loss_poisson_pos) for n in 1:szz];
103+
104+
odov = [OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!v[n]), svv[n]) for n in 1:szz];
105+
@time resov = Optim.optimize.(odov, svv, Ref(Optim.LBFGS()), Ref(opt));
106+
# seeems faster!
107+
svvoff = cat([svv[n].off for n in 1:szz]..., dims=2)
108+
resoff = cat([resov[n].minimizer.off for n in 1:szz]..., dims=2)
109+
success = sum(abs.(collect(svvoff) .- vec_true.off), dims=1) .< 0.5
110+
success = success .&& sum(abs.(collect(resoff) .- vec_true.off), dims=1) .< 0.5
111+
# all worked!
112+
sum(.!success)
113+
end
89114
# findfirst(.!success)
90115
@vt collect(dat)[:,:,.!success[:]] collect(pdat)[:,:,.!success[:]] collect(gaussian_vec(sz, startvals))[:,:,.!success[:]] collect(gaussian_vec(sz, reso.minimizer))[:,:,.!success[:]]
91116

src/general.jl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,21 +58,30 @@ function calculate_separables_nokw(::Type{AT}, fct, sz::NTuple{N, Int},
5858
# args_1d = ntuple((d) -> arg_n(d, args), Val(N))
5959
# in_place_assing!.(all_axes, 1, fct, idcs, sz, args_1d)
6060
for (res, sz1d, d) in zip(all_axes, sz, 1:N)
61-
off = get_vec_dim(offset, d, sz)
62-
sca = get_vec_dim(scale, d, sz)
61+
# off = get_vec_dim(offset, d, sz) # not needed any more since in get_1d_ids
62+
# sca = get_vec_dim(scale, d, sz)
6363
idc = get_1d_ids(d, sz, offset, scale)
6464
args_d = arg_n(d, args, RT, sz) #
6565
# in_place_assing!(res, 1, fct, idc, sz1d, args_d)
6666
# @show size(res)
67-
# @show size(idc)
6867
res .= fct.(idc, sz1d, args_d...) # 5 allocs, 160 bytes
6968
end
7069
return all_axes
7170
# return res
7271
end
7372

73+
"""
74+
get_1d_ids(d, sz::NTuple{N, Int}, offset, scale) where {N}
75+
76+
returns one-dimensional indices for a given dimension `d` of an N-dimensional array.
77+
The indices are shifted by `offset` and scaled by `scale`, which can also be vectors
78+
"""
79+
# for Numbers, the reorient comes last, to have it CUDA-compatible
80+
get_1d_ids(d, sz::NTuple{N, Int}, offset::Number, scale::Number) where {N} = (reorient(get_vec_dim(scale, d, sz) .* ((1:sz[d]) .- get_vec_dim(offset, d, sz)), d, Val(N)))
81+
# for abstract arrays, we first have to reorient.
7482
get_1d_ids(d, sz::NTuple{N, Int}, offset, scale) where {N} = get_vec_dim(scale, d, sz) .* (reorient((1:sz[d]), d, Val(N)) .- get_vec_dim(offset, d, sz))
75-
get_1d_ids(d, sz::NTuple{N, Int}, offset) where {N} = (reorient((1:sz[d]), d, Val(N)) .- get_vec_dim(offset, d, sz))
83+
get_1d_ids(d, sz::NTuple{N, Int}, offset::Number) where {N} = (reorient((1:sz[d]) .- get_vec_dim(offset, d, sz), d, Val(N)))
84+
get_1d_ids(d, sz::NTuple{N, Int}, offset) where {N} = reorient(1:sz[d], d, Val(N)) .- get_vec_dim(offset, d, sz)
7685
# get_1d_ids(d, sz, offset, scale) = pick_n(d, scale) .* ((1:sz[d]) .- pick_n(d, offset))
7786
# get_1d_ids(d, sz, offset::NTuple, scale::NTuple) = scale[d] .* ((1:sz[d]) .- offset[d])
7887

@@ -340,6 +349,7 @@ function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(cal
340349
sca = isnothing(args[2]) ? RAT([one(RT)]) : RT.(args[2])
341350

342351
ids = ntuple((d) -> get_1d_ids(d, sz, off, sca), Val(N)) # offset==args[1] and scale==args[2]
352+
# ids_offset_only = ntuple((d) -> get_1d_ids(d, sz, off), Val(N)) # offset==args[1] and scale==args[2]
343353
ids_offset_only = get_1d_ids.(1:N, Ref(sz), Ref(off)) # , one(eltype(AT))
344354

345355
extra_sz = get_arg_sz(sz, args...)

test/speedtests.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ using IndexFunArrays
33
using SeparableFunctions
44
using BenchmarkTools
55
using CUDA
6+
using PlotlyJS
67

78
function speedt_test()
89
sz = (256, 256, 256)
@@ -15,7 +16,7 @@ function speedt_test()
1516
exp(-sum(abs2.((Tuple(ci) .- offset)./sigma)))
1617
end
1718
res = get_exp.(CartesianIndices(sz), Ref(Float32(sqrt(2)).*sigma), Ref(offset));
18-
@btime get_exp.(CartesianIndices($sz), Ref($sigma)); # 47.7 ms (2 allocations, 64 Mb) , but 243 ms with offset!
19+
@btime get_exp.(CartesianIndices($sz), Ref($sigma), Ref(0)); # 47.7 ms (2 allocations, 64 Mb) , but 243 ms with offset!
1920
@btime get_exp.(CartesianIndices($sz), Ref($sigma), Ref(offset)); # 47.7 ms, but 243 ms with offset (7 allocations, 64 Mb)!
2021

2122
res2 = similar(res);
@@ -62,7 +63,6 @@ function speedt_test()
6263
res3_sep = gaussian_sep(typeof(resc), sz, sigma=sigma, offset=offset); #
6364
tc_res2_assign = @belapsed CUDA.@sync $resc .= $res3_sep
6465

65-
using PlotlyJS
6666
method = ["Compute In Place", "Collect Separables", "Lazy Arrays", "Collect Precomputed", "Precompute"]
6767
dat_no_cuda = 1000 .*[t_in_place, t_gaussian_col, t_gaussian_lz, t_res2_assign, t_gaussian_sep]
6868
dat_cuda = 1000 .*[tc_get_exp, tc_gaussian_col, tc_gaussian_lz, tc_res2_assign, tc_gaussian_sep]
@@ -73,7 +73,7 @@ function speedt_test()
7373
], Layout(title="3D Gaussian (512x512x256)", yaxis=attr(title="Time [ms]", type="log"))) # barmode="stack",
7474

7575

76-
# now som speed comparison for propagator_col!
76+
# now some speed comparison for propagator_col!
7777
sz = (1024, 1024)
7878
Δz = 1f0
7979
scale = 0.5f0 ./ (max.(sz ./ 2, 1))

0 commit comments

Comments
 (0)