#!/usr/bin/env julia
#
# Benchmark: CPU vs GPU for fem2d_mpi_solve
#
# Run with:
#   mpiexec -n 1 julia --project=. tools/benchmark_cpu_vs_gpu.jl
#
# Note: Metal only supports Float32, so we use Float32 for both CPU and GPU
# to ensure a fair comparison.

using MPI
MPI.Init()

# NOTE(review): the diff view this file was recovered from elides a few lines
# here (per the hunk context they include `using Metal` and the MPI
# communicator setup that defines `comm`, presumably `comm = MPI.COMM_WORLD`).
# Restore them from the original file before running — `comm` is used below.
using MultiGridBarrierMPI
using MultiGridBarrier
using LinearAlgebraMPI
using BenchmarkTools
using Printf

println("\n" * "=" ^ 70)
println("Benchmark: fem2d_mpi_solve - CPU vs GPU")
println("MPI ranks: $(MPI.Comm_size(comm))")
println("Element type: Float32 (Metal requirement)")
println("Running L = 1:7")
println("=" ^ 70)

# Store one NamedTuple of timings per refinement level L.
results = Vector{NamedTuple}()

for L in 1:7
    # Build the CPU grid once just to report the problem size n for this L.
    g = fem2d(Float32; L=L)
    n = size(g.x, 1)

    println("\n--- L = $L (n = $n) ---")

    # Benchmark CPU. The plan cache is cleared first so neither run benefits
    # from plans built by the other; samples=1 evals=1 because a full solve
    # is expensive and we only want one end-to-end wall-clock measurement.
    println("  Benchmarking CPU...")
    LinearAlgebraMPI.clear_plan_cache!()
    b_cpu = @benchmark fem2d_mpi_solve(Float32; L=$L, verbose=false) samples=1 evals=1
    cpu_time = median(b_cpu.times) / 1e9  # BenchmarkTools reports ns; convert to s

    # Benchmark GPU (Metal backend via LinearAlgebraMPI.mtl).
    println("  Benchmarking GPU...")
    LinearAlgebraMPI.clear_plan_cache!()
    b_gpu = @benchmark fem2d_mpi_solve(Float32; L=$L, backend=LinearAlgebraMPI.mtl, verbose=false) samples=1 evals=1
    gpu_time = median(b_gpu.times) / 1e9

    push!(results, (L=L, n=n, cpu=cpu_time, gpu=gpu_time))

    # Per-level report: speedup > 1 means the GPU run was faster.
    speedup = cpu_time / gpu_time
    println("  CPU: $(round(cpu_time, digits=3))s")
    println("  GPU: $(round(gpu_time, digits=3))s")
    if speedup > 1
        println("  Speedup: $(round(speedup, digits=2))x (GPU faster)")
    else
        println("  Slowdown: $(round(1 / speedup, digits=2))x (CPU faster)")
    end
end

# Summary table of all levels.
println("\n" * "=" ^ 70)
println("Summary")
println("=" ^ 70)
println("\n  L       n      CPU      GPU    Speedup")
println("  -       -      ---      ---    -------")
for r in results
    n_str = lpad(r.n, 7)
    cpu_str = @sprintf("%6.3fs", r.cpu)
    gpu_str = @sprintf("%6.3fs", r.gpu)

    speedup = r.cpu / r.gpu
    if speedup > 1
        speedup_str = @sprintf("%.2fx GPU", speedup)
    else
        speedup_str = @sprintf("%.2fx CPU", 1 / speedup)
    end
    speedup_str = lpad(speedup_str, 10)

    println("  $(r.L) $n_str $cpu_str $gpu_str $speedup_str")
end

println("\nSpeedup = CPU time / GPU time (>1 means GPU is faster)")
println("=" ^ 70)