Skip to content

Commit 7401973

Browse files
committed
compute-engine gpu: add qs and allreduce runs
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent 24a1c01 commit 7401973

34 files changed

Lines changed: 1348 additions & 328 deletions

README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@ This study will test HPC application performance across three clouds. The reposi
1111
- [Amazon Web Services](experiments/aws) includes Parallel Cluster (EC2), and EKS (Kubernetes) for each of CPU and GPU.
1212
- [Microsoft Azure](experiments/azure) includes CycleCloud (VMs), and AKS (Kubernetes) for each of CPU and GPU.
1313

14-
## Timing
15-
16-
This is a checklist for the setups we have tested and timed:
17-
1814
## Experiments
1915

2016
### "Bare Metal"
@@ -44,14 +40,14 @@ This is a checklist for the setups we have tested and timed:
4440
- [x] size 64 (vsoch done 8/26/2024)
4541
- [x] size 128 (vsoch done 8/27/2024)
4642
- [x] size 256 (vsoch done 8/27/2024)
47-
- [ ] Google Compute Engine GPU
43+
- [x] Google Compute Engine GPU
4844
- done on llnl-flux
4945
- [x] New VM and automation needed with Terraform (vsoch, early 9/2024)
5046
- [x] size 4 (vsoch 9/6/2024)
5147
- [x] size 8 (vsoch 9/7/2024)
5248
- [x] size 16 (vsoch 9/8/2024)
5349
- [x] size 32 (vsoch 9/8/2024)
54-
- [ ] quicksilver and osu all reduce need runs at all sizes.
50+
- [x] quicksilver and osu allreduce runs at all sizes (vsoch 9/9/2024)
5551

5652
### Kubernetes
5753

experiments/google/compute-engine/gpu/build-images/startup-script.sh

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,20 +104,30 @@ cd flux-sched
104104
./configure --prefix=/usr --sysconfdir=/etc
105105
make -j 8 && sudo make install && sudo ldconfig
106106

107-
# install openmpi with cuda
107+
# Note: UCX was built (and Open MPI rebuilt against it) afterwards to get OSU/quicksilver working
108+
109+
cd /opt
110+
sudo git clone https://github.com/openucx/ucx && \
111+
sudo chown -R $USER ./ucx && cd ucx/ && \
112+
git clean -xfd && \
113+
./autogen.sh && mkdir build && cd build && \
114+
../configure --prefix=/usr --enable-debug --with-cuda=/usr/local/cuda --enable-mt --disable-cma && \
115+
make -j && sudo make install
116+
117+
# If already existed - remove
118+
sudo rm -rf /usr/local/pancakes/
119+
120+
# install openmpi with cuda and ucx
108121
cd /opt
109122
sudo mkdir -p /usr/local/pancakes && \
110-
sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz && \
123+
sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz || true && \
111124
sudo tar -xzvf openmpi-4.1.2.tar.gz && \
112125
cd openmpi-4.1.2 && \
113126
sudo chown -R $USER $(pwd) && \
114-
./configure --with-cuda --prefix=/usr/local/pancakes && \
115-
make -j 20 && sudo make install
116-
117-
# TODO check these, should be provided in flux environment later
118-
# ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
119-
# ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/pancakes/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
120-
# ENV LD_LIBRARY_PATH=/usr/local/pancakes/lib:/opt/miniconda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
127+
make distclean || true && \
128+
mkdir build && cd build
129+
../configure --with-cuda=/usr/local/cuda --with-ucx=/usr/ --prefix=/usr/local/pancakes
130+
make -j && sudo make install
121131

122132
cd /opt
123133

@@ -384,3 +394,47 @@ sudo apt-get install -y --no-install-recommends --allow-change-held-packages apt
384394
sudo echo "deb https://packages.cloud.google.com/apt google-fast-socket main" | tee /etc/apt/sources.list.d/google-fast-socket.list
385395
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add
386396
sudo apt update && sudo apt install -y --no-install-recommends google-fast-socket=0.0.5
397+
398+
399+
# Install additional apps for bare metal, osu and quicksilver and multi-gpu-models
400+
54 make -j
401+
55 make install
402+
56 cd /opt/osu-benchmark/
403+
57 rm -rf build.openmpi/
404+
58 export OSU_VERSION=5.8
405+
59 mkdir -p build.openmpi && cd build.openmpi && ../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=mpicc CXX=mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda && make && make install
406+
60 export PATH=/usr/local/pancakes/bin:$PATH
407+
61 make && make install
408+
62 cd /opt/containers/
409+
63 cd /root
410+
411+
# OSU benchmarks
412+
sudo git clone --depth 1 https://github.com/ULHPC/tutorials /opt/tutorials && \
413+
sudo mkdir -p /opt/osu-benchmark && \
414+
sudo chown -R $USER /opt/tutorials /opt/osu-benchmark && \
415+
cd /opt/osu-benchmark && \
416+
ln -s /opt/tutorials/parallel/mpi/OSU_MicroBenchmarks ref.d && \
417+
ln -s ref.d/Makefile . && \
418+
ln -s ref.d/scripts . && \
419+
mkdir src && \
420+
cd src && \
421+
export OSU_VERSION=5.8 && \
422+
wget --no-check-certificate http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tgz && \
423+
tar xf osu-micro-benchmarks-${OSU_VERSION}.tgz && \
424+
cd /opt/osu-benchmark && \
425+
# Compile based on openmpi with cuda/ucx
426+
mkdir -p build.openmpi && cd build.openmpi && \
427+
../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=mpicc CXX=mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda && \
428+
make && make install
429+
430+
# Quicksilver
431+
sudo git clone https://github.com/LLNL/Quicksilver quicksilver
432+
sudo chown -R $USER /opt/quicksilver
433+
wget https://raw.githubusercontent.com/converged-computing/performance-study/main/docker/google/gpu/quicksilver/Makefile
434+
cd /opt/quicksilver/src
435+
make || nvcc -DHAVE_CUDA -std=c++11 -O2 -Xptxas -v -gencode=arch=compute_70,code=\"sm_70,compute_70\" --compiler-bindir=/usr/local/pancakes/bin/mpicxx -L/usr/local/cuda/lib64/ -lcuda -lcudart -lm -o qs CollisionEvent.o CoralBenchmark.o CycleTracking.o DecompositionObject.o DirectionCosine.o EnergySpectrum.o GlobalFccGrid.o GridAssignmentObject.o InputBlock.o MCT.o MC_Adjacent_Facet.o MC_Base_Particle.o MC_Domain.o MC_Facet_Crossing_Event.o MC_Fast_Timer.o MC_Load_Particle.o MC_Location.o MC_Particle_Buffer.o MC_RNG_State.o MC_Segment_Outcome.o MC_SourceNow.o MacroscopicCrossSection.o MeshPartition.o MonteCarlo.o MpiCommObject.o NuclearData.o Parameters.o ParticleVault.o ParticleVaultContainer.o PopulationControl.o SendQueue.o SharedMemoryCommObject.o Tallies.o cmdLineParser.o cudaFunctions.o initMC.o main.o parseUtils.o utils.o utilsMpi.o && sudo cp qs /usr/bin/qs
436+
437+
# Multi-gpu-models
438+
sudo git clone https://github.com/NVIDIA/multi-gpu-programming-models /opt/multi-gpu-programming-models && \
439+
cd multi-gpu-programming-models/mpi && \
440+
make && sudo mv jacobi /usr/local/bin

experiments/google/compute-engine/gpu/debug/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ And from the outside and within the container.
1616
strace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-trace-f.txt
1717
```
1818

19+
### [flux-singularity-ltrace-f.txt](flux-singularity-ltrace-f.txt)
20+
21+
```bash
22+
ltrace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-ltrace-f.txt
23+
```
24+
1925
### [flux-singularity-trace-s-f.txt](flux-singularity-trace-s-f.txt)
2026

2127
```bash

experiments/google/compute-engine/gpu/size16/README.md

Lines changed: 6 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
- Cluster coming up at 4:35pm Mountain, $320/hour
55
- Cluster coming down at 6:00pm
66

7-
I'm not running OSU all reduce or quicksilver.
8-
97
## Experiment
108

119
Shell in:
@@ -270,108 +268,37 @@ mkdir -p $output
270268
# We should do H H - better values across the board
271269
./flux-run-combinations.sh 16 $app
272270

273-
# D D and H H errors (bad results but we ran anyway):
274-
# The call to cuMemHostRegister(0x78407fe00008, 134217728, 0) failed.
275-
# Host: flux-004
276-
# cuMemHostRegister return value: 1
277-
# Registration cache: smcuda
278-
279-
# Note that osu_latency had worse values with D D. H H seems better across the board.
280-
cho "Running iteration $i"
281-
282-
# -d cuda H H/D D slowest and has errors for allreduce
283-
284271
# These were run separately
285272
export app=osu-allreduce
286273
export output=results/$app
287274
mkdir -p $output
288275

289-
# I skipped these for now because we need to debug the GPU issue, don't
290-
# want to spend the money credits on crappy results
291-
# confirmed using all 8 gpu, but just a little, mostly memory (~312MiB)
292-
for i in $(seq 2 2); do
293-
294-
# original command for 4, 2m 36 seconds
295-
time flux run -opmi=pmix -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
296-
--setattr=user.study_id=$app-4-DD-iter-$i \
297-
singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \
298-
bash -c "ulimit -m 9999999999 ; /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda D D"
299-
300-
# 2m 41 seconds
301-
time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-HH-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \
302-
singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \
303-
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H
304-
305-
# 2m 19 seconds
306-
time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \
307-
singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \
308-
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
276+
# Fastest with H H (the flags used in the command below) and the OMPI envars.
277+
for i in $(seq 1 5); do
278+
time flux run --env OMPI_COMM_WORLD_LOCAL_RANK=0 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H
309279
done
310280

311-
# Not tested yet! There are still errors (and much slower times) with any cuda flags
312-
sflux run --setattr=user.study_id=$app-8-iter-$i -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
313-
singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
314-
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
315-
316-
flux run --setattr=user.study_id=$app-16-iter-$i -N 16 -n 128 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
317-
singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
318-
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
319-
320-
flux run --setattr=user.study_id=$app-32-iter-$i -N 32 -n 256 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
321-
singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
322-
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
323-
324281
# When they are done:
325282
./save.sh $output
326283
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output
327284
```
328285

329286
#### Quicksilver
330287

331-
Testing:
332-
333-
```console
334-
# This is the only app that didn't run (I tried a lot of different configs)
335-
# The call to cuMemHostRegister(0x7fbb82200008, 134217728, 0) failed.
336-
# Host: flux-004
337-
# cuMemHostRegister return value: 1
338-
# Registration cache: smcuda
339-
340-
# testing smcuda snake error
341-
flux run -opmi=pmix -o gpu-affinity=per-task --env OMP_NUM_THREADS=1 -o cpu-affinity=per-task -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 2 -K 2 -n 26214400
342-
343-
# only works on one node, ssh is not allowed
344-
mpirun -n 8 --map-by ppr:8:node singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 16 -Y 16 -Z 16 -x 16 -y 16 -z 16 -I 4 -J 4 -K 2 -n 163840
345-
346-
time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 4 -K 2 -n 26214400
347-
```
348-
349-
Run attempt:
350-
351288
```console
352289
export app=quicksilver
353290
export output=results/$app
354291
mkdir -p $output
355292

356-
# Error:
357-
# --------------------------------------------------------------------------
358-
# The call to cuMemHostRegister(0x7e21e1c00008, 134217728, 0) failed.
359-
# Host: flux-001
360-
# cuMemHostRegister return value: 1
361-
# Registration cache: smcuda
362-
# --------------------------------------------------------------------------
363-
364-
# confirmed using all 8 GPU, 100%, despite error above
365-
# Allowing 10 minutes to see output, and if none, cancelling.
293+
# confirmed using all 8 GPU, 100%
294+
# Allowing 15 minutes to run then cancel
366295
for i in $(seq 1 1); do
367296
echo "Running iteration $i"
368-
# Try this and see if completes
369-
time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-4-iter-$i -N4 -n 32 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 32 -x 32 -y 32 -z 32 -I 4 -J 4 -K 2 -n 52428800
297+
time flux run --exclusive --env OMP_NUM_THREADS=1 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 64 -Y 64 -Z 32 -x 64 -y 64 -z 32 -I 8 -J 4 -K 4 -n 209715200
370298
done
371299

372300
# When they are done:
373301
./save.sh $output
374-
375302
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output
376303
```
377304

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
58.145s: job.exception type=cancel severity=0 interrupted by ctrl-C
2+
58.552s: job.exception type=cancel severity=0 interrupted by ctrl-C
3+
flux-job: task(s) Terminated
4+
5+
# OSU MPI-CUDA Allreduce Latency Test v5.8
6+
# Size Avg Latency(us)
7+
4 337.07
8+
8 333.06
9+
16 334.79
10+
32 340.88
11+
64 333.40
12+
128 341.31
13+
256 343.44
14+
512 356.03
15+
1024 373.93
16+
2048 411.85
17+
4096 451.33
18+
8192 746.66
19+
16384 1038.96
20+
32768 1323.51
21+
65536 1420.76
22+
131072 2002.24
23+
262144 1318.68
24+
524288 2188.49
25+
1048576 3982.10
26+
START OF JOBSPEC
27+
{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-1"}}, "version": 1}
28+
START OF EVENTLOG
29+
{"timestamp":1725922820.0990932,"name":"init"}
30+
{"timestamp":1725922820.1000268,"name":"starting"}
31+
{"timestamp":1725922820.779249,"name":"shell.init","context":{"service":"501043911-shell-f2cZAePdh","leader-rank":0,"size":16}}
32+
{"timestamp":1725922820.9687486,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
33+
{"timestamp":1725922878.296582,"name":"shell.task-exit","context":{"localid":1,"rank":65,"state":"Exited","pid":1487,"wait_status":15,"signaled":15,"exitcode":143}}
34+
{"timestamp":1725922878.8372781,"name":"complete","context":{"status":36608}}
35+
{"timestamp":1725922878.837321,"name":"done"}
36+
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
23.953s: job.exception type=cancel severity=0 interrupted by ctrl-C
2+
24.155s: job.exception type=cancel severity=0 interrupted by ctrl-C
3+
24.348s: job.exception type=cancel severity=0 interrupted by ctrl-C
4+
flux-job: task(s) Terminated
5+
6+
# OSU MPI-CUDA Allreduce Latency Test v5.8
7+
# Size Avg Latency(us)
8+
4 346.93
9+
8 343.36
10+
16 345.98
11+
32 345.99
12+
64 345.13
13+
128 343.23
14+
256 359.02
15+
512 366.02
16+
1024 379.76
17+
2048 423.79
18+
4096 467.01
19+
8192 738.21
20+
16384 1113.73
21+
32768 1372.31
22+
65536 1449.55
23+
131072 2002.28
24+
262144 1318.81
25+
524288 2209.33
26+
1048576 4025.37
27+
START OF JOBSPEC
28+
{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-2"}}, "version": 1}
29+
START OF EVENTLOG
30+
{"timestamp":1725922887.4867351,"name":"init"}
31+
{"timestamp":1725922887.4880421,"name":"starting"}
32+
{"timestamp":1725922887.6449196,"name":"shell.init","context":{"service":"501043911-shell-f38FgPx4w","leader-rank":0,"size":16}}
33+
{"timestamp":1725922887.6735122,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
34+
{"timestamp":1725922911.4788885,"name":"shell.task-exit","context":{"localid":0,"rank":16,"state":"Exited","pid":1540,"wait_status":15,"signaled":15,"exitcode":143}}
35+
{"timestamp":1725922911.8023748,"name":"complete","context":{"status":36608}}
36+
{"timestamp":1725922911.8023956,"name":"done"}
37+
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
28.598s: job.exception type=cancel severity=0 interrupted by ctrl-C
2+
28.759s: job.exception type=cancel severity=0 interrupted by ctrl-C
3+
28.880s: job.exception type=cancel severity=0 interrupted by ctrl-C
4+
flux-job: task(s) Terminated
5+
6+
# OSU MPI-CUDA Allreduce Latency Test v5.8
7+
# Size Avg Latency(us)
8+
4 343.81
9+
8 340.60
10+
16 342.57
11+
32 341.13
12+
64 349.80
13+
128 352.47
14+
256 366.56
15+
512 372.22
16+
1024 389.31
17+
2048 433.20
18+
4096 469.32
19+
8192 772.67
20+
16384 1108.57
21+
32768 1423.65
22+
65536 1486.61
23+
131072 1971.11
24+
262144 1310.67
25+
524288 2192.83
26+
1048576 4009.07
27+
START OF JOBSPEC
28+
{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-3"}}, "version": 1}
29+
START OF EVENTLOG
30+
{"timestamp":1725922915.0641441,"name":"init"}
31+
{"timestamp":1725922915.0654099,"name":"starting"}
32+
{"timestamp":1725922915.1928067,"name":"shell.init","context":{"service":"501043911-shell-f3LQaYUUF","leader-rank":0,"size":16}}
33+
{"timestamp":1725922915.2170558,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
34+
{"timestamp":1725922943.6773355,"name":"shell.task-exit","context":{"localid":0,"rank":64,"state":"Exited","pid":1619,"wait_status":15,"signaled":15,"exitcode":143}}
35+
{"timestamp":1725922944.1458526,"name":"complete","context":{"status":36608}}
36+
{"timestamp":1725922944.1458805,"name":"done"}
37+

0 commit comments

Comments
 (0)