
Commit b03b10f

Sébastien Loisel authored and committed
Fix CI: use rev=main to get latest commits with sum(MatrixMPI) fix
1 parent 478e1b5 commit b03b10f

23 files changed

Lines changed: 2342 additions & 6 deletions

.github/workflows/CI.yml

Lines changed: 8 additions & 6 deletions
@@ -49,9 +49,10 @@ jobs:
           julia --project=. -e '
             using Pkg
             Pkg.add(name="OpenSSL_jll", version="3.0")
-            # Install from GitHub (with MPI fixes)
-            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git")
-            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git")
+            # Install from GitHub main branch (with MPI fixes)
+            # Use rev="main" to get latest commit, not just tagged version
+            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git", rev="main")
+            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git", rev="main")
             Pkg.instantiate()
             Pkg.build("PyCall")
           '
@@ -111,9 +112,10 @@ jobs:
             using Pkg
             Pkg.develop(PackageSpec(path=pwd()))
             Pkg.add(name="OpenSSL_jll", version="3.0")
-            # Install from GitHub (with MPI fixes)
-            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git")
-            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git")
+            # Install from GitHub main branch (with MPI fixes)
+            # Use rev="main" to get latest commit, not just tagged version
+            Pkg.add(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git", rev="main")
+            Pkg.add(url="https://github.com/sloisel/MultiGridBarrier.jl.git", rev="main")
             Pkg.instantiate()
             Pkg.build("PyCall")
           '
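
The fix works because Pkg.add accepts a rev keyword alongside url, pinning the checkout to a branch (or commit) rather than the most recent registered tag. For reference, a minimal sketch of the equivalent PackageSpec form (not part of this commit):

using Pkg
# Same effect as Pkg.add(url=..., rev="main"): track the tip of the
# main branch rather than the latest tagged release.
Pkg.add(PackageSpec(url="https://github.com/sloisel/LinearAlgebraMPI.jl.git", rev="main"))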

tools/benchmark_cpu_vs_gpu.jl

Lines changed: 107 additions & 0 deletions
#!/usr/bin/env julia
#
# Benchmark: CPU vs Auto (GPU with size threshold) for fem2d_mpi_solve
#
# Run with:
#   mpiexec -n 1 julia --project=MultiGridBarrierMPI.jl MultiGridBarrierMPI.jl/tools/benchmark_cpu_vs_gpu.jl
#
# Note: Metal only supports Float32, so we use Float32 for both CPU and GPU
# to ensure a fair comparison.
#
# Two modes:
#   - CPU:  pure CPU (no backend parameter)
#   - Auto: automatic GPU/CPU selection based on the GPU_MIN_SIZE threshold

using MPI
MPI.Init()

comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)

println("Loading packages...")
using Metal
using MultiGridBarrierMPI
using MultiGridBarrier
using LinearAlgebraMPI
using LinearAlgebraMPI: GPU_MIN_SIZE
using BenchmarkTools
using Printf

# Configurable threshold for "Auto" mode
const AUTO_THRESHOLD = 1000

println("\n" * "="^70)
println("Benchmark: fem2d_mpi_solve - CPU vs Auto")
println("  MPI ranks: $(MPI.Comm_size(comm))")
println("  Element type: Float32 (Metal requirement)")
println("  Auto threshold: GPU_MIN_SIZE = $AUTO_THRESHOLD")
println("  Running L = 1:6")
println("="^70)

# Store results
results = Vector{NamedTuple}()

for L in 1:6
    # Get grid size
    g = fem2d(Float32; L=L)
    n = size(g.x, 1)

    println("\n--- L = $L (n = $n) ---")

    # Benchmark CPU (pure CPU, no backend)
    println("  Benchmarking CPU...")
    LinearAlgebraMPI.clear_plan_cache!()
    b_cpu = @benchmark fem2d_mpi_solve(Float32; L=$L, verbose=false) samples=1 evals=1
    cpu_time = median(b_cpu.times) / 1e9

    # Benchmark Auto (GPU_MIN_SIZE threshold)
    println("  Benchmarking Auto (threshold=$AUTO_THRESHOLD)...")
    LinearAlgebraMPI.clear_plan_cache!()
    GPU_MIN_SIZE[] = AUTO_THRESHOLD
    b_auto = @benchmark fem2d_mpi_solve(Float32; L=$L, backend=LinearAlgebraMPI.mtl, verbose=false) samples=1 evals=1
    auto_time = median(b_auto.times) / 1e9

    # Determine which arrays went to GPU in auto mode
    GPU_MIN_SIZE[] = AUTO_THRESHOLD
    g_test = fem2d_mpi(Float32; L=L, backend=LinearAlgebraMPI.mtl)
    auto_is_gpu = !(g_test.x.A isa Matrix)

    push!(results, (L=L, n=n, cpu=cpu_time, auto=auto_time, auto_gpu=auto_is_gpu))

    # Print results
    speedup = cpu_time / auto_time
    println("  CPU:  $(round(cpu_time, digits=3))s")
    println("  Auto: $(round(auto_time, digits=3))s [$(auto_is_gpu ? "GPU" : "CPU")]")
    if speedup > 1
        println("  Speedup: $(round(speedup, digits=2))x (Auto faster)")
    else
        println("  Speedup: $(round(1/speedup, digits=2))x (CPU faster)")
    end
end

# Summary table
println("\n" * "="^70)
println("Summary")
println("="^70)
println("\n  L        n      CPU     Auto     Speedup  Auto backend")
println("  -        -      ---     ----     -------  ------------")
for r in results
    n_str = lpad(r.n, 7)
    cpu_str = @sprintf("%6.3fs", r.cpu)
    auto_str = @sprintf("%6.3fs", r.auto)

    speedup = r.cpu / r.auto
    if speedup > 1
        speedup_str = @sprintf("%.2fx Auto", speedup)
    else
        speedup_str = @sprintf("%.2fx CPU", 1/speedup)
    end
    speedup_str = lpad(speedup_str, 10)

    auto_backend = r.auto_gpu ? "GPU" : "CPU"
    println("  $(r.L)  $n_str  $cpu_str  $auto_str  $speedup_str  $auto_backend")
end

println("\n  Auto threshold: GPU_MIN_SIZE = $AUTO_THRESHOLD")
println("  Speedup = CPU time / Auto time (>1 means Auto is faster)")
println("="^70)

tools/debug_iteration.jl

Lines changed: 82 additions & 0 deletions
#!/usr/bin/env julia
using MPI
MPI.Init()

using MultiGridBarrierMPI
using LinearAlgebraMPI
using LinearAlgebraMPI: VectorMPI, MatrixMPI, _local_rows, VectorMPI_local

MultiGridBarrierMPI.Init()

g = fem2d_mpi(Float64; L=6)
x = g.x
w = g.w

println("x type: ", typeof(x))
println("w type: ", typeof(w))

row_iters = (_local_rows(x), _local_rows(w))
println("\nrow_iters types:")
println("  _local_rows(x): ", typeof(row_iters[1]))
println("  _local_rows(w): ", typeof(row_iters[2]))

println("\nFirst few items from zip(row_iters...):")
for (i, items) in enumerate(zip(row_iters...))
    println("  $i: types = $(typeof.(items))")
    if i >= 3
        break
    end
end

f = (row_x, w) -> w * sum(row_x)

println("\nTiming comprehension vs loop:")

# Comprehension with zip
t1 = time_ns()
results1 = [f(rows...) for rows in zip(row_iters...)]
t1 = (time_ns() - t1) / 1000
println("Comprehension:          $(round(t1, digits=1)) μs, length=$(length(results1))")

# Simple loop
row_iters2 = (_local_rows(x), _local_rows(w))
t2 = time_ns()
results2 = Float64[]
for rows in zip(row_iters2...)
    push!(results2, f(rows...))
end
t2 = (time_ns() - t2) / 1000
println("Simple loop:            $(round(t2, digits=1)) μs, length=$(length(results2))")

# Direct access loop
local_x = x.A
local_w = w.v
t3 = time_ns()
results3 = Vector{Float64}(undef, size(local_x, 1))
for i in 1:size(local_x, 1)
    results3[i] = f(view(local_x, i, :), local_w[i])
end
t3 = (time_ns() - t3) / 1000
println("Direct loop:            $(round(t3, digits=1)) μs, length=$(length(results3))")

# Using map with eachrow and vector
t4 = time_ns()
results4 = collect(map(f, eachrow(local_x), local_w))
t4 = (time_ns() - t4) / 1000
println("map(f, eachrow, v):     $(round(t4, digits=1)) μs, length=$(length(results4))")

# What about map with _local_rows?
row_iters3 = (_local_rows(x), _local_rows(w))
t5 = time_ns()
results5 = collect(map(f, row_iters3...))
t5 = (time_ns() - t5) / 1000
println("map(f, _local_rows...): $(round(t5, digits=1)) μs, length=$(length(results5))")

# Now time VectorMPI_local
println("\nTiming VectorMPI_local:")
t6 = time_ns()
v = VectorMPI_local(results3)
t6 = (time_ns() - t6) / 1000
println("VectorMPI_local: $(round(t6, digits=1)) μs")

println("\nDone.")

tools/debug_map.jl

Lines changed: 36 additions & 0 deletions
#!/usr/bin/env julia
using MPI
MPI.Init()

using MultiGridBarrierMPI
using LinearAlgebraMPI
using LinearAlgebraMPI: VectorMPI, MatrixMPI, _local_rows

MultiGridBarrierMPI.Init()

g = fem2d_mpi(Float64; L=4)
x = g.x
w = g.w

println("x type: ", typeof(x))
println("w type: ", typeof(w))

row_iters = (_local_rows(x), _local_rows(w))
println("\nrow_iters types:")
println("  _local_rows(x): ", typeof(row_iters[1]))
println("  _local_rows(w): ", typeof(row_iters[2]))

f = (row_x, w) -> w * sum(row_x)

println("\nTrying map(f, row_iters...):")
result = collect(map(f, row_iters...))
println("Result type: ", typeof(result))
println("Result length: ", length(result))
println("First 5 elements: ", result[1:min(5, length(result))])

println("\nTrying with eachrow and vector directly:")
result2 = collect(map(f, eachrow(x.A), w.v))
println("Result2 type: ", typeof(result2))
println("Result2 length: ", length(result2))

println("\nDone.")

tools/julia_profile.jl

Lines changed: 37 additions & 0 deletions
#!/usr/bin/env julia
# Use Julia's built-in profiler to find bottlenecks
using MPI
MPI.Init()

using Profile
using MultiGridBarrier
using MultiGridBarrierMPI
using LinearAlgebraMPI

MultiGridBarrierMPI.Init()

const L = 6

println("="^70)
println("Julia profiling at L=$L")
println("="^70)

# Create geometry
g_mpi = fem2d_mpi(Float64; L=L)
println("Grid points: ", sum(g_mpi.x.row_partition) - 2)

# Warmup
println("Warmup...")
MultiGridBarrier.amgb(g_mpi; verbose=false, tol=0.1)

# Profile the solve
println("Profiling MPI solve...")
Profile.clear()
@profile MultiGridBarrier.amgb(g_mpi; verbose=false, tol=0.1)

# Print profile
println("\nTop 30 by flat count:")
Profile.print(format=:flat, sortedby=:count, mincount=100, maxdepth=30)

println("\n" * "="^70)
println("Done.")

tools/julia_profile2.jl

Lines changed: 37 additions & 0 deletions
#!/usr/bin/env julia
# Use Julia's built-in profiler with tree view
using MPI
MPI.Init()

using Profile
using MultiGridBarrier
using MultiGridBarrierMPI
using LinearAlgebraMPI

MultiGridBarrierMPI.Init()

const L = 6

println("="^70)
println("Julia profiling at L=$L (tree view)")
println("="^70)

# Create geometry
g_mpi = fem2d_mpi(Float64; L=L)
println("Grid points: ", sum(g_mpi.x.row_partition) - 2)

# Warmup
println("Warmup...")
MultiGridBarrier.amgb(g_mpi; verbose=false, tol=0.1)

# Profile the solve
println("Profiling MPI solve...")
Profile.clear()
@profile MultiGridBarrier.amgb(g_mpi; verbose=false, tol=0.1)

# Print tree view (collapsed)
println("\nTree view (mincount=200):")
Profile.print(format=:tree, mincount=200, maxdepth=25)

println("\n" * "="^70)
println("Done.")
