|
4 | 4 | - Cluster coming up at 4:35pm Mountain, $320/hour |
5 | 5 | - Cluster coming down at 6:00pm |
6 | 6 |
|
7 | | -I'm not running OSU all reduce or quicksilver. |
8 | | - |
9 | 7 | ## Experiment |
10 | 8 |
|
11 | 9 | Shell in: |
@@ -270,108 +268,37 @@ mkdir -p $output |
270 | 268 | # We should do H H - better values across the board |
271 | 269 | ./flux-run-combinations.sh 16 $app |
272 | 270 |
|
273 | | -# D D and H H errors (bad results but we ran anyway): |
274 | | -# The call to cuMemHostRegister(0x78407fe00008, 134217728, 0) failed. |
275 | | -# Host: flux-004 |
276 | | -# cuMemHostRegister return value: 1 |
277 | | -# Registration cache: smcuda |
278 | | - |
279 | | -# Note that osu_latency had worse values with D D. H H seems better across the board. |
280 | | -cho "Running iteration $i" |
281 | | - |
282 | | -# -d cuda H H/D D slowest and has errors for allreduce |
283 | | - |
284 | 271 | # These were run separately |
285 | 272 | export app=osu-allreduce |
286 | 273 | export output=results/$app |
287 | 274 | mkdir -p $output |
288 | 275 |
|
289 | | -# I skipped these for now because we need to debug the GPU issue, don't |
290 | | -# want to spend the money credits on crappy results |
291 | | -# confirmed using all 8 gpu, but just a little, mostly memory (~312MiB) |
292 | | -for i in $(seq 2 2); do |
293 | | - |
294 | | -# original command for 4, 2m 36 seconds |
295 | | -time flux run -opmi=pmix -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \ |
296 | | - --setattr=user.study_id=$app-4-DD-iter-$i \ |
297 | | - singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \ |
298 | | - bash -c "ulimit -m 9999999999 ; /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda D D" |
299 | | - |
300 | | -# 2m 41 seconds |
301 | | -time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-HH-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \ |
302 | | - singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \ |
303 | | - /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H |
304 | | - |
305 | | -# 2m 19 seconds |
306 | | -time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \ |
307 | | - singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \ |
308 | | - /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce |
| 276 | +# Fastest with D D and the OMPI environment variables (note: the command below passes H H). |
| 277 | +for i in $(seq 1 5); do |
| 278 | + time flux run --env OMPI_COMM_WORLD_LOCAL_RANK=0 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H |
309 | 279 | done |
310 | 280 |
|
311 | | -# Not tested yet! There are still errors (and much slower times) with any cuda flags |
312 | | -sflux run --setattr=user.study_id=$app-8-iter-$i -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \ |
313 | | -singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \ |
314 | | -/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce |
315 | | - |
316 | | -flux run --setattr=user.study_id=$app-16-iter-$i -N 16 -n 128 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \ |
317 | | -singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \ |
318 | | -/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce |
319 | | - |
320 | | -flux run --setattr=user.study_id=$app-32-iter-$i -N 32 -n 256 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \ |
321 | | -singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \ |
322 | | -/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce |
323 | | - |
324 | 281 | # When they are done: |
325 | 282 | ./save.sh $output |
326 | 283 | oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output |
327 | 284 | ``` |
328 | 285 |
|
329 | 286 | #### Quicksilver |
330 | 287 |
|
331 | | -Testing: |
332 | | - |
333 | | -```console |
334 | | -# This is the only app that didn't run (I tried a lot of different configs) |
335 | | -# The call to cuMemHostRegister(0x7fbb82200008, 134217728, 0) failed. |
336 | | -# Host: flux-004 |
337 | | -# cuMemHostRegister return value: 1 |
338 | | -# Registration cache: smcuda |
339 | | - |
340 | | -# testing smcuda snake error |
341 | | -flux run -opmi=pmix -o gpu-affinity=per-task --env OMP_NUM_THREADS=1 -o cpu-affinity=per-task -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 2 -K 2 -n 26214400 |
342 | | - |
343 | | -# only works on one node, ssh is not allowed |
344 | | -mpirun -n 8 --map-by ppr:8:node singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 16 -Y 16 -Z 16 -x 16 -y 16 -z 16 -I 4 -J 4 -K 2 -n 163840 |
345 | | - |
346 | | -time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 4 -K 2 -n 26214400 |
347 | | -``` |
348 | | - |
349 | | -Run attempt: |
350 | | - |
351 | 288 | ```console |
352 | 289 | export app=quicksilver |
353 | 290 | export output=results/$app |
354 | 291 | mkdir -p $output |
355 | 292 |
|
356 | | -# Error: |
357 | | -# -------------------------------------------------------------------------- |
358 | | -# The call to cuMemHostRegister(0x7e21e1c00008, 134217728, 0) failed. |
359 | | -# Host: flux-001 |
360 | | -# cuMemHostRegister return value: 1 |
361 | | -# Registration cache: smcuda |
362 | | -# -------------------------------------------------------------------------- |
363 | | - |
364 | | -# confirmed using all 8 GPU, 100%, despite error above |
365 | | -# Allowing 10 minutes to see output, and if none, cancelling. |
| 293 | +# Confirmed using all 8 GPUs, at 100% |
| 294 | +# Allowing 15 minutes to run, then cancelling |
366 | 295 | for i in $(seq 1 1); do |
367 | 296 | echo "Running iteration $i" |
368 | | - # Try this and see if completes |
369 | | - time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-4-iter-$i -N4 -n 32 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 32 -x 32 -y 32 -z 32 -I 4 -J 4 -K 2 -n 52428800 |
| 297 | + time flux run --exclusive --env OMP_NUM_THREADS=1 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 64 -Y 64 -Z 32 -x 64 -y 64 -z 32 -I 8 -J 4 -K 4 -n 209715200 |
370 | 298 | done |
371 | 299 |
|
372 | 300 | # When they are done: |
373 | 301 | ./save.sh $output |
374 | | - |
375 | 302 | oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output |
376 | 303 | ``` |
377 | 304 |
|
|
0 commit comments