|
| 1 | +#!/usr/bin/env julia |
| 2 | +# |
| 3 | +# Benchmark: CPU vs Auto (GPU with size threshold) for fem2d_mpi_solve |
| 4 | +# |
| 5 | +# Run with: |
| 6 | +# mpiexec -n 1 julia --project=MultiGridBarrierMPI.jl MultiGridBarrierMPI.jl/tools/benchmark_cpu_vs_gpu.jl |
| 7 | +# |
| 8 | +# Note: Metal only supports Float32, so we use Float32 for both CPU and GPU |
| 9 | +# to ensure a fair comparison. |
| 10 | +# |
| 11 | +# Two modes: |
| 12 | +# - CPU: Pure CPU (no backend parameter) |
| 13 | +# - Auto: Automatic GPU/CPU selection based on GPU_MIN_SIZE threshold |
| 14 | + |
| 15 | +using MPI |
# Start MPI before the solver packages below are loaded.
MPI.Init()

# World communicator and this process's rank within it.
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)

# Report progress from rank 0 only, so a multi-rank run prints the line once
# instead of once per process.
rank == 0 && println("Loading packages...")
| 22 | +using Metal |
| 23 | +using MultiGridBarrierMPI |
| 24 | +using MultiGridBarrier |
| 25 | +using LinearAlgebraMPI |
| 26 | +using LinearAlgebraMPI: GPU_MIN_SIZE |
| 27 | +using BenchmarkTools |
| 28 | +using Printf |
| 29 | + |
# Value written to `LinearAlgebraMPI.GPU_MIN_SIZE[]` for the "Auto" runs.
const AUTO_THRESHOLD = 1000

# Print the banner from rank 0 only; every rank would otherwise emit its own
# copy when the script is launched with more than one MPI process.
if rank == 0
    println("\n" * "="^70)
    println("Benchmark: fem2d_mpi_solve - CPU vs Auto")
    println(" MPI ranks: $(MPI.Comm_size(comm))")
    println(" Element type: Float32 (Metal requirement)")
    println(" Auto threshold: GPU_MIN_SIZE = $AUTO_THRESHOLD")
    println(" Running L = 1:6")
    println("="^70)
end
| 40 | + |
"""
    benchmark_level(L)

Time `fem2d_mpi_solve(Float32; L=L, verbose=false)` in the two modes compared by
this script and return a named tuple `(L, n, cpu, auto, auto_gpu)`.

- `n` is `size(fem2d(Float32; L=L).x, 1)`.
- `cpu` is the time in seconds of one run without a `backend` keyword.
- `auto` is the time in seconds of one run with `backend=LinearAlgebraMPI.mtl`
  while `GPU_MIN_SIZE[]` is set to `AUTO_THRESHOLD`.
- `auto_gpu` is `true` when `fem2d_mpi(...; backend=LinearAlgebraMPI.mtl).x.A`
  is not a plain `Matrix`. NOTE(review): this is taken as the sign that the data
  was placed on the GPU; confirm against LinearAlgebraMPI's array types.

Each timing is a single sample with a single evaluation. The previous value of
`GPU_MIN_SIZE[]` is restored before returning, so every CPU run executes with the
same threshold regardless of which level was benchmarked before it. Progress
output is printed by rank 0 only.
"""
function benchmark_level(L)
    is_root = rank == 0

    # Grid size of this level, reported alongside the timings.
    n = size(fem2d(Float32; L=L).x, 1)
    is_root && println("\n--- L = $L (n = $n) ---")

    # CPU mode: no backend keyword.
    is_root && println(" Benchmarking CPU...")
    LinearAlgebraMPI.clear_plan_cache!()
    b_cpu = @benchmark fem2d_mpi_solve(Float32; L=$L, verbose=false) samples=1 evals=1
    cpu_time = median(b_cpu.times) / 1e9  # BenchmarkTools records nanoseconds

    # Auto mode: Metal backend with the configured size threshold. The threshold
    # is a global setting, so put the old value back even if the solve throws.
    is_root && println(" Benchmarking Auto (threshold=$AUTO_THRESHOLD)...")
    LinearAlgebraMPI.clear_plan_cache!()
    previous_threshold = GPU_MIN_SIZE[]
    GPU_MIN_SIZE[] = AUTO_THRESHOLD
    auto_time, auto_is_gpu = try
        b_auto = @benchmark fem2d_mpi_solve(
            Float32; L=$L, backend=LinearAlgebraMPI.mtl, verbose=false
        ) samples=1 evals=1

        # Build the geometry once more under the same threshold to see which
        # array type the Auto mode picked for this level.
        g_test = fem2d_mpi(Float32; L=L, backend=LinearAlgebraMPI.mtl)
        (median(b_auto.times) / 1e9, !(g_test.x.A isa Matrix))
    finally
        GPU_MIN_SIZE[] = previous_threshold
    end

    if is_root
        speedup = cpu_time / auto_time
        println(" CPU: $(round(cpu_time, digits=3))s")
        println(" Auto: $(round(auto_time, digits=3))s [$(auto_is_gpu ? "GPU" : "CPU")]")
        if speedup > 1
            println(" Speedup: $(round(speedup, digits=2))x (Auto faster)")
        else
            println(" Speedup: $(round(1/speedup, digits=2))x (CPU faster)")
        end
    end

    return (L=L, n=n, cpu=cpu_time, auto=auto_time, auto_gpu=auto_is_gpu)
end

# One result per refinement level; `map` yields a concretely typed vector.
results = map(benchmark_level, 1:6)
| 81 | + |
"""
    summary_line(level, n, cpu, auto, speedup, backend)

Format one line of the summary table. Header, underline and data rows all go
through this function so that their columns share the same widths
(`n`, `cpu`, `auto` left-padded to 7 characters, `speedup` to 10).
"""
function summary_line(level, n, cpu, auto, speedup, backend)
    return string(
        " ", level,
        " ", lpad(n, 7),
        " ", lpad(cpu, 7),
        " ", lpad(auto, 7),
        " ", lpad(speedup, 10),
        " ", backend,
    )
end

"""
    print_summary(results)

Print the summary table for `results`, an iterable of records with the fields
`L`, `n`, `cpu`, `auto` (times in seconds) and `auto_gpu`.
"""
function print_summary(results)
    println("\n" * "="^70)
    println("Summary")
    println("="^70)
    println("\n", summary_line("L", "n", "CPU", "Auto", "Speedup", "Auto backend"))
    println(summary_line("-", "-", "---", "----", "-------", "------------"))

    for r in results
        cpu_str = @sprintf("%6.3fs", r.cpu)
        auto_str = @sprintf("%6.3fs", r.auto)

        # Always show a factor >= 1 and name the faster mode next to it.
        speedup = r.cpu / r.auto
        speedup_str = if speedup > 1
            @sprintf("%.2fx Auto", speedup)
        else
            @sprintf("%.2fx CPU", 1/speedup)
        end

        auto_backend = r.auto_gpu ? "GPU" : "CPU"
        println(summary_line(r.L, r.n, cpu_str, auto_str, speedup_str, auto_backend))
    end

    println("\n Auto threshold: GPU_MIN_SIZE = $AUTO_THRESHOLD")
    println(" Speedup = CPU time / Auto time (>1 means Auto is faster)")
    println("="^70)
    return nothing
end

# Summary table, printed by rank 0 only so a multi-rank run shows it once.
rank == 0 && print_summary(results)
0 commit comments