converged-computing
diff --git a/‎README.md‎
Lines changed: 2 additions & 6 deletions b/‎README.md‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎experiments/google/compute-engine/gpu/build-images/startup-script.sh‎
Lines changed: 63 additions & 9 deletions b/‎experiments/google/compute-engine/gpu/build-images/startup-script.sh‎
Lines changed: 63 additions & 9 deletions
diff --git a/‎experiments/google/compute-engine/gpu/debug/README.md‎
Lines changed: 6 additions & 0 deletions b/‎experiments/google/compute-engine/gpu/debug/README.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎experiments/google/compute-engine/gpu/size16/README.md‎
Lines changed: 6 additions & 79 deletions b/‎experiments/google/compute-engine/gpu/size16/README.md‎
Lines changed: 6 additions & 79 deletions
diff --git a/‎experiments/google/compute-engine/gpu/size16/results/osu-allreduce/osu-allreduce-16-iter-1-3561500966912.out‎
Lines changed: 36 additions & 0 deletions b/‎experiments/google/compute-engine/gpu/size16/results/osu-allreduce/osu-allreduce-16-iter-1-3561500966912.out‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎experiments/google/compute-engine/gpu/size16/results/osu-allreduce/osu-allreduce-16-iter-2-4692083998720.out‎
Lines changed: 37 additions & 0 deletions b/‎experiments/google/compute-engine/gpu/size16/results/osu-allreduce/osu-allreduce-16-iter-2-4692083998720.out‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/google/compute-engine/gpu/size16/results/osu-allreduce/osu-allreduce-16-iter-3-5154749284352.out‎
Lines changed: 37 additions & 0 deletions b/‎experiments/google/compute-engine/gpu/size16/results/osu-allreduce/osu-allreduce-16-iter-3-5154749284352.out‎
Lines changed: 37 additions & 0 deletions
@@ -11,10 +11,6 @@ This study will test HPC application performance across three clouds. The reposi
  - [Amazon Web Services](experiments/aws) includes Parallel Cluster (EC2), and EKS (Kubernetes) for each of CPU and GPU
  - [Microsoft Azure](experiments/azure) includes CycleCloud (VMs), and AKS (Kubernetes) for each of CPU and GPU.
 
-## Timing
-
-This is a checklist for the setups we have tested and timed:
-
 ## Experiments
 
 ### "Bare Metal"
@@ -44,14 +40,14 @@ This is a checklist for the setups we have tested and timed:
   - [x] size 64 (vsoch done 8/26/2024)
   - [x] size 128 (vsoch done 8/27/2024)
   - [x] size 256 (vsoch done 8/27/2024)
-- [ ] Google Compute Engine GPU
+- [x] Google Compute Engine GPU
   - done on llnl-flux
   - [x] New VM and automation needed with Terraform (vsoch, early 9/2024)
   - [x] size 4 (vsoch 9/6/2024)
   - [x] size 8 (vsoch 9/7/2024)
   - [x] size 16 (vsoch 9/8/2024)
   - [x] size 32 (vsoch 9/8/2024)
-  - [ ] quicksilver and osu all reduce need runs at all sizes.
+  - [x] quicksilver and osu all reduce need runs at all sizes (vsoch 9/9/2024)
 
 ### Kubernetes
 
 
@@ -104,20 +104,30 @@ cd flux-sched
 ./configure --prefix=/usr --sysconfdir=/etc
 make -j 8 && sudo make install && sudo ldconfig
 
-# install openmpi with cuda
+# Note that UCX (and a rebuild of open mpi) was done after to get OSU/quicksilver working
+
+# Build UCX from a fresh clone in /opt/ucx, out-of-tree in ./build, with CUDA
+# taken from /usr/local/cuda, and install it under the /usr prefix.
+# `git clean -xfd` removes untracked and ignored files before autogen.
+# NOTE(review): --enable-debug is passed to configure; confirm a debug build is
+# what is wanted for benchmark runs.
+# NOTE(review): `make -j` has no job limit (other builds in this script use
+# -j 8) - confirm the build VM has enough memory for that.
+cd /opt
+sudo git clone https://github.com/openucx/ucx && \
+    sudo chown -R $USER ./ucx && cd ucx/ && \
+    git clean -xfd && \
+    ./autogen.sh && mkdir build && cd build && \
+    ../configure --prefix=/usr --enable-debug --with-cuda=/usr/local/cuda --enable-mt --disable-cma && \
+    make -j && sudo make install
+
+# If already existed - remove
+sudo rm -rf /usr/local/pancakes/
+
+# install openmpi with cuda and ucx
+# Open MPI 4.1.2 is built out-of-tree in ./build with CUDA from /usr/local/cuda
+# and UCX from /usr, then installed to /usr/local/pancakes.
+# The whole sequence is a single && chain so that configure and make only run
+# after `cd build` succeeded. `|| true` keeps the chain going when wget or
+# `make distclean` fail; `mkdir -p` lets the block be rerun on an existing tree.
 cd /opt
 sudo mkdir -p /usr/local/pancakes && \
-    sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz && \
+    sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz || true && \
     sudo tar -xzvf openmpi-4.1.2.tar.gz && \
     cd openmpi-4.1.2 && \
     sudo chown -R $USER $(pwd) && \
-    ./configure --with-cuda --prefix=/usr/local/pancakes && \
-    make -j 20 && sudo make install
-
-# TODO check these, should be provided in flux environment later
-# ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-# ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/pancakes/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-# ENV LD_LIBRARY_PATH=/usr/local/pancakes/lib:/opt/miniconda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+    make distclean || true && \
+    mkdir -p build && cd build && \
+    ../configure --with-cuda=/usr/local/cuda --with-ucx=/usr/ --prefix=/usr/local/pancakes && \
+    make -j && sudo make install
 
 cd /opt
 
@@ -384,3 +394,47 @@ sudo apt-get install -y --no-install-recommends --allow-change-held-packages apt
 sudo echo "deb https://packages.cloud.google.com/apt google-fast-socket main" | tee /etc/apt/sources.list.d/google-fast-socket.list
 curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add
 sudo apt update && sudo apt install -y --no-install-recommends google-fast-socket=0.0.5
+
+
+# Install additional apps for bare metal, osu and quicksilver and multi-gpu-models
+# NOTE: the numbered lines below are pasted `history` output from the manual
+# rebuild session. They are kept for reference only and commented out: left
+# as-is, each line would be run as a command named after its history number
+# (e.g. `54`) and fail.
+#   54  make -j
+#   55  make install
+#   56  cd /opt/osu-benchmark/
+#   57  rm -rf build.openmpi/
+#   58  export OSU_VERSION=5.8
+#   59  mkdir -p build.openmpi && cd build.openmpi &&     ../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=mpicc CXX=mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda &&     make && make install
+#   60  export PATH=/usr/local/pancakes/bin:$PATH
+#   61  make && make install
+#   62  cd /opt/containers/
+#   63  cd /root
+
+# OSU benchmarks
+# Fetches the ULHPC tutorials (Makefile and scripts are symlinked from there),
+# downloads OSU micro-benchmarks ${OSU_VERSION}, and builds them with CUDA
+# support into /opt/osu-benchmark/build.openmpi.
+# The MPI compiler wrappers are given by absolute path: Open MPI is installed
+# with --prefix=/usr/local/pancakes, and that bin directory is not guaranteed
+# to be on PATH when this script runs.
+sudo git clone --depth 1 https://github.com/ULHPC/tutorials /opt/tutorials && \
+    sudo mkdir -p /opt/osu-benchmark && \
+    sudo chown -R $USER /opt/tutorials /opt/osu-benchmark && \
+    cd /opt/osu-benchmark && \
+    ln -s /opt/tutorials/parallel/mpi/OSU_MicroBenchmarks ref.d && \
+    ln -s ref.d/Makefile . && \
+    ln -s ref.d/scripts  . && \
+    mkdir src && \
+    cd src && \
+    export OSU_VERSION=5.8 && \
+    wget --no-check-certificate http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tgz && \
+    tar xf osu-micro-benchmarks-${OSU_VERSION}.tgz && \
+    cd /opt/osu-benchmark && \
+    # Compile based on openmpi with cuda/ucx
+    mkdir -p build.openmpi && cd build.openmpi && \
+    ../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=/usr/local/pancakes/bin/mpicc CXX=/usr/local/pancakes/bin/mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda && \
+    make && make install
+    
+# Quicksilver
+# Clone to an absolute path: the working directory at this point is whatever
+# the previous build left behind, and the chown/cd below expect /opt/quicksilver.
+sudo git clone https://github.com/LLNL/Quicksilver /opt/quicksilver
+sudo chown -R $USER /opt/quicksilver
+cd /opt/quicksilver/src
+# Replace the stock Makefile with the study's GPU Makefile. -O overwrites the
+# existing file; without it wget would save the download as Makefile.1.
+wget -O Makefile https://raw.githubusercontent.com/converged-computing/performance-study/main/docker/google/gpu/quicksilver/Makefile
+# If make fails, link the already-compiled objects by hand with nvcc,
+# then install the qs binary.
+make || nvcc -DHAVE_CUDA -std=c++11 -O2 -Xptxas -v -gencode=arch=compute_70,code=\"sm_70,compute_70\" --compiler-bindir=/usr/local/pancakes/bin/mpicxx -L/usr/local/cuda/lib64/ -lcuda -lcudart -lm -o qs CollisionEvent.o CoralBenchmark.o CycleTracking.o DecompositionObject.o DirectionCosine.o EnergySpectrum.o GlobalFccGrid.o GridAssignmentObject.o InputBlock.o MCT.o MC_Adjacent_Facet.o MC_Base_Particle.o MC_Domain.o MC_Facet_Crossing_Event.o MC_Fast_Timer.o MC_Load_Particle.o MC_Location.o MC_Particle_Buffer.o MC_RNG_State.o MC_Segment_Outcome.o MC_SourceNow.o MacroscopicCrossSection.o MeshPartition.o MonteCarlo.o MpiCommObject.o NuclearData.o Parameters.o ParticleVault.o ParticleVaultContainer.o PopulationControl.o SendQueue.o SharedMemoryCommObject.o Tallies.o cmdLineParser.o cudaFunctions.o initMC.o main.o parseUtils.o utils.o utilsMpi.o && sudo cp qs /usr/bin/qs
+
+# Multi-gpu-models
+# Builds the MPI jacobi example and installs it to /usr/local/bin.
+# The clone is made with sudo, so ownership is handed to $USER before the
+# unprivileged make; the cd uses the absolute clone path so it does not depend
+# on the current working directory.
+sudo git clone https://github.com/NVIDIA/multi-gpu-programming-models /opt/multi-gpu-programming-models && \
+    sudo chown -R $USER /opt/multi-gpu-programming-models && \
+    cd /opt/multi-gpu-programming-models/mpi && \
+    make && sudo mv jacobi /usr/local/bin
@@ -16,6 +16,12 @@ And from the outside and within the container.
 strace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task   singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif    /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-trace-f.txt
 ```
 
+### [flux-singularity-ltrace-f.txt](flux-singularity-ltrace-f.txt)
+
+```bash
+ltrace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task   singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif    /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-ltrace-f.txt
+```
+
 ### [flux-singularity-trace-s-f.txt](flux-singularity-trace-s-f.txt)
 
 ```bash
 
@@ -4,8 +4,6 @@
 - Cluster coming up at 4:35pm Mountain, $320/hour
 - Cluster coming down at 6:00pm
 
-I'm not running OSU all reduce or quicksilver.
-
 ## Experiment
 
 Shell in:
@@ -270,108 +268,37 @@ mkdir -p $output
 # We should do H H - better values across the board
 ./flux-run-combinations.sh 16 $app
 
-# D D and H H errors (bad results but we ran anyway):
-# The call to cuMemHostRegister(0x78407fe00008, 134217728, 0) failed.
-#  Host:  flux-004
-#  cuMemHostRegister return value:  1
-#  Registration cache:  smcuda
-
-# Note that osu_latency had worse values with D D. H H seems better across the board.
-cho "Running iteration $i"
-
-# -d cuda H H/D D slowest and has errors for allreduce
-
 # These were run separately
 export app=osu-allreduce
 export output=results/$app
 mkdir -p $output
 
-# I skipped these for now because we need to debug the GPU issue, don't
-# want to spend the money credits on crappy results
-# confirmed using all 8 gpu, but just a little, mostly memory (~312MiB)
-for i in $(seq 2 2); do
-
-# original command for 4, 2m 36 seconds  
-time flux run -opmi=pmix -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
-  --setattr=user.study_id=$app-4-DD-iter-$i \
-  singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif  \
-  bash -c "ulimit -m 9999999999 ; /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda D D"
-
-# 2m 41 seconds
-time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-HH-iter-$i  -o cpu-affinity=per-task -o gpu-affinity=per-task \
-  singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif  \
-  /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H
-
-# 2m 19 seconds
-time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \
-  singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif  \
-  /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
+# fastest configuration found, using the OMPI envars (NOTE: the command below passes H H, not D D).
+for i in $(seq 1 5); do
+   time flux run --env OMPI_COMM_WORLD_LOCAL_RANK=0 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H
 done
 
-# Not tested yet! There are still errors (and much slower times) with any cuda flags
-sflux run --setattr=user.study_id=$app-8-iter-$i -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
-singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
-/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
-
-flux run --setattr=user.study_id=$app-16-iter-$i -N 16 -n 128 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
-singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
-/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
-
-flux run --setattr=user.study_id=$app-32-iter-$i -N 32 -n 256 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
-singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
-/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
-
 # When they are done:
 ./save.sh $output
 oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output
 ```
 
 #### Quicksilver
 
-Testing:
-
-```console
-# This is the only app that didn't run (I tried a lot of different configs)
-# The call to cuMemHostRegister(0x7fbb82200008, 134217728, 0) failed.
-#  Host:  flux-004
-#  cuMemHostRegister return value:  1
-#  Registration cache:  smcuda
-
-# testing smcuda snake error
-flux run -opmi=pmix -o gpu-affinity=per-task --env OMP_NUM_THREADS=1 -o cpu-affinity=per-task -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 2 -K 2 -n 26214400
-
-# only works on one node, ssh is not allowed
-mpirun -n 8 --map-by ppr:8:node singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 16 -Y 16 -Z 16 -x 16 -y 16 -z 16 -I 4 -J 4 -K 2 -n 163840
-
-time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 4 -K 2 -n 26214400
-```
-
-Run attempt:
-
 ```console
 export app=quicksilver
 export output=results/$app
 mkdir -p $output
 
-# Error:
-# --------------------------------------------------------------------------
-# The call to cuMemHostRegister(0x7e21e1c00008, 134217728, 0) failed.
-#  Host:  flux-001
-#  cuMemHostRegister return value:  1
-#  Registration cache:  smcuda
-# --------------------------------------------------------------------------
-
-# confirmed using all 8 GPU, 100%, despite error above
-# Allowing 10 minutes to see output, and if none, cancelling.
+# confirmed using all 8 GPU, 100%
+# Allowing 15 minutes to run then cancel
 for i in $(seq 1 1); do
     echo "Running iteration $i"
-    # Try this and see if completes
-    time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-4-iter-$i -N4 -n 32 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 32 -x 32 -y 32 -z 32 -I 4 -J 4 -K 2 -n 52428800
+   time flux run --exclusive --env OMP_NUM_THREADS=1 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 64  -Y 64  -Z 32  -x 64  -y 64  -z 32  -I 8  -J 4  -K 4  -n 209715200
 done
 
 # When they are done:
 ./save.sh $output
-
 oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output
 ```
 
 
@@ -0,0 +1,36 @@
+58.145s: job.exception type=cancel severity=0 interrupted by ctrl-C
+58.552s: job.exception type=cancel severity=0 interrupted by ctrl-C
+flux-job: task(s) Terminated
+
+# OSU MPI-CUDA Allreduce Latency Test v5.8
+# Size       Avg Latency(us)
+4                     337.07
+8                     333.06
+16                    334.79
+32                    340.88
+64                    333.40
+128                   341.31
+256                   343.44
+512                   356.03
+1024                  373.93
+2048                  411.85
+4096                  451.33
+8192                  746.66
+16384                1038.96
+32768                1323.51
+65536                1420.76
+131072               2002.24
+262144               1318.68
+524288               2188.49
+1048576              3982.10
+START OF JOBSPEC
+{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-1"}}, "version": 1}
+START OF EVENTLOG
+{"timestamp":1725922820.0990932,"name":"init"}
+{"timestamp":1725922820.1000268,"name":"starting"}
+{"timestamp":1725922820.779249,"name":"shell.init","context":{"service":"501043911-shell-f2cZAePdh","leader-rank":0,"size":16}}
+{"timestamp":1725922820.9687486,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
+{"timestamp":1725922878.296582,"name":"shell.task-exit","context":{"localid":1,"rank":65,"state":"Exited","pid":1487,"wait_status":15,"signaled":15,"exitcode":143}}
+{"timestamp":1725922878.8372781,"name":"complete","context":{"status":36608}}
+{"timestamp":1725922878.837321,"name":"done"}
+
@@ -0,0 +1,37 @@
+23.953s: job.exception type=cancel severity=0 interrupted by ctrl-C
+24.155s: job.exception type=cancel severity=0 interrupted by ctrl-C
+24.348s: job.exception type=cancel severity=0 interrupted by ctrl-C
+flux-job: task(s) Terminated
+
+# OSU MPI-CUDA Allreduce Latency Test v5.8
+# Size       Avg Latency(us)
+4                     346.93
+8                     343.36
+16                    345.98
+32                    345.99
+64                    345.13
+128                   343.23
+256                   359.02
+512                   366.02
+1024                  379.76
+2048                  423.79
+4096                  467.01
+8192                  738.21
+16384                1113.73
+32768                1372.31
+65536                1449.55
+131072               2002.28
+262144               1318.81
+524288               2209.33
+1048576              4025.37
+START OF JOBSPEC
+{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-2"}}, "version": 1}
+START OF EVENTLOG
+{"timestamp":1725922887.4867351,"name":"init"}
+{"timestamp":1725922887.4880421,"name":"starting"}
+{"timestamp":1725922887.6449196,"name":"shell.init","context":{"service":"501043911-shell-f38FgPx4w","leader-rank":0,"size":16}}
+{"timestamp":1725922887.6735122,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
+{"timestamp":1725922911.4788885,"name":"shell.task-exit","context":{"localid":0,"rank":16,"state":"Exited","pid":1540,"wait_status":15,"signaled":15,"exitcode":143}}
+{"timestamp":1725922911.8023748,"name":"complete","context":{"status":36608}}
+{"timestamp":1725922911.8023956,"name":"done"}
+
@@ -0,0 +1,37 @@
+28.598s: job.exception type=cancel severity=0 interrupted by ctrl-C
+28.759s: job.exception type=cancel severity=0 interrupted by ctrl-C
+28.880s: job.exception type=cancel severity=0 interrupted by ctrl-C
+flux-job: task(s) Terminated
+
+# OSU MPI-CUDA Allreduce Latency Test v5.8
+# Size       Avg Latency(us)
+4                     343.81
+8                     340.60
+16                    342.57
+32                    341.13
+64                    349.80
+128                   352.47
+256                   366.56
+512                   372.22
+1024                  389.31
+2048                  433.20
+4096                  469.32
+8192                  772.67
+16384                1108.57
+32768                1423.65
+65536                1486.61
+131072               1971.11
+262144               1310.67
+524288               2192.83
+1048576              4009.07
+START OF JOBSPEC
+{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-3"}}, "version": 1}
+START OF EVENTLOG
+{"timestamp":1725922915.0641441,"name":"init"}
+{"timestamp":1725922915.0654099,"name":"starting"}
+{"timestamp":1725922915.1928067,"name":"shell.init","context":{"service":"501043911-shell-f3LQaYUUF","leader-rank":0,"size":16}}
+{"timestamp":1725922915.2170558,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
+{"timestamp":1725922943.6773355,"name":"shell.task-exit","context":{"localid":0,"rank":64,"state":"Exited","pid":1619,"wait_status":15,"signaled":15,"exitcode":143}}
+{"timestamp":1725922944.1458526,"name":"complete","context":{"status":36608}}
+{"timestamp":1725922944.1458805,"name":"done"}
+