diff --git a/.github/workflows/build-mac-arm64.yml b/.github/workflows/build-mac-arm64.yml
index dc387a7011..a77775a170 100644
--- a/.github/workflows/build-mac-arm64.yml
+++ b/.github/workflows/build-mac-arm64.yml
@@ -93,7 +93,17 @@ jobs:
       
     - name: make test
       shell: bash -l {0}
-      run: conda activate shapeworks && source ./devenv.sh ./build/bin && cd build && ctest -VV
+      run: |
+        conda activate shapeworks && source ./devenv.sh ./build/bin && cd build
+        # Run ctest; if it fails, re-run failed tests under lldb to capture stack traces
+        if ! ctest -VV; then
+          echo "=== Tests failed, re-running failures under lldb for stack traces ==="
+          for exe in $(ctest --rerun-failed -N 2>&1 | grep "^  Test" | awk '{print $NF}'); do
+            echo "=== Running bin/$exe under lldb ==="
+            lldb -b -o run -k "thread backtrace all" -k "quit 1" -- bin/$exe 2>&1 || true
+          done
+          exit 1
+        fi
 
     - uses: actions/upload-artifact@v4
       with:
diff --git a/Libs/Optimize/GradientDescentOptimizer.cpp b/Libs/Optimize/GradientDescentOptimizer.cpp
index e195089856..26cf240573 100644
--- a/Libs/Optimize/GradientDescentOptimizer.cpp
+++ b/Libs/Optimize/GradientDescentOptimizer.cpp
@@ -7,6 +7,7 @@
 #include <tbb/parallel_for.h>
 #include <time.h>
 
+#include <atomic>
 #include <algorithm>
 #include <chrono>
 #include <ctime>
@@ -130,10 +131,19 @@ void GradientDescentOptimizer::start_adaptive_gauss_seidel_optimization() {
     counter++;
 
     TIME_START("parallel_sampling");
+    // Suppress observer events during parallel section to avoid data races.
+    // Observers (ShapeMatrix, ShapeGradientMatrix, etc.) will be batch-updated
+    // via SynchronizePositions() after the parallel section completes.
+    particle_system_->SetEventsEnabled(false);
+
+    // Thread-safe max tracking: use atomic compare-exchange
+    std::atomic<double> atomic_max_change{0.0};
+
     // Iterate over each domain
     const auto domains_per_shape = particle_system_->GetDomainsPerShape();
     tbb::parallel_for(
         tbb::blocked_range<size_t>{0, num_domains / domains_per_shape}, [&](const tbb::blocked_range<size_t>& r) {
+          double local_max_change = 0.0;  // per-task max to minimize atomic contention
           for (size_t shape = r.begin(); shape < r.end(); ++shape) {
             for (int shape_dom_idx = 0; shape_dom_idx < domains_per_shape; shape_dom_idx++) {
               auto dom = shape * domains_per_shape + shape_dom_idx;
@@ -206,8 +216,8 @@ void GradientDescentOptimizer::start_adaptive_gauss_seidel_optimization() {
                   if (new_energy < energy) {
                     // good move, increase timestep for next time
                     time_steps_[dom][k] *= factor;
-                    if (grad_mag > max_change) {
-                      max_change = grad_mag;
+                    if (grad_mag > local_max_change) {
+                      local_max_change = grad_mag;
                     }
                     break;
                   } else {  // bad move, reset point position and back off on timestep
@@ -219,8 +229,8 @@ void GradientDescentOptimizer::start_adaptive_gauss_seidel_optimization() {
                       time_steps_[dom][k] /= factor;
                     } else {
                       // keep the move with timestep 1.0 anyway
-                      if (grad_mag > max_change) {
-                        max_change = grad_mag;
+                      if (grad_mag > local_max_change) {
+                        local_max_change = grad_mag;
                       }
                       break;
                     }
@@ -229,9 +239,21 @@ void GradientDescentOptimizer::start_adaptive_gauss_seidel_optimization() {
               }    // for each particle
             }
           }  // for each domain
+          // Update global max atomically
+          double prev = atomic_max_change.load(std::memory_order_relaxed);
+          while (local_max_change > prev &&
+                 !atomic_max_change.compare_exchange_weak(prev, local_max_change, std::memory_order_relaxed)) {
+          }
         });
 
+    max_change = atomic_max_change.load();
+
     TIME_STOP("parallel_sampling");
+
+    // Re-enable events and batch-update all observers with final positions.
+    particle_system_->SetEventsEnabled(true);
+    particle_system_->SynchronizePositions();
+
     number_of_iterations_++;
     gradient_function_->after_iteration();
 
diff --git a/Libs/Optimize/ParticleSystem.cpp b/Libs/Optimize/ParticleSystem.cpp
index 9cc52d542f..24c9552be6 100644
--- a/Libs/Optimize/ParticleSystem.cpp
+++ b/Libs/Optimize/ParticleSystem.cpp
@@ -167,12 +167,13 @@ const ParticleSystem::PointType& ParticleSystem::SetPosition(const PointType& p,
     }
   }
 
-  // Notify any observers.
-  ParticlePositionSetEvent e;
-  e.SetDomainIndex(d);
-  e.SetPositionIndex(k);
-
-  this->InvokeEvent(e);
+  // Notify any observers (skipped when events are disabled for thread safety).
+  if (m_EventsEnabled) {
+    ParticlePositionSetEvent e;
+    e.SetDomainIndex(d);
+    e.SetPositionIndex(k);
+    this->InvokeEvent(e);
+  }
 
   return m_Positions[d]->operator[](k);
 }
diff --git a/Libs/Optimize/ParticleSystem.h b/Libs/Optimize/ParticleSystem.h
index 02017877d2..b4d95471da 100644
--- a/Libs/Optimize/ParticleSystem.h
+++ b/Libs/Optimize/ParticleSystem.h
@@ -379,6 +379,12 @@ class ParticleSystem : public itk::DataObject {
   }
   unsigned int GetDomainsPerShape() const { return m_DomainsPerShape; }
 
+  /** Suppress/resume observer event notifications.  When disabled, SetPosition()
+      still updates particle positions but does not fire InvokeEvent.  Call
+      SynchronizePositions() after re-enabling to update all observers. */
+  void SetEventsEnabled(bool enabled) { m_EventsEnabled = enabled; }
+  bool GetEventsEnabled() const { return m_EventsEnabled; }
+
   /** Set the number of domains.  This method modifies the size of the
     m_Domains, m_Positions, and m_Transform lists. */
   void SetNumberOfDomains(unsigned int);
@@ -451,6 +457,9 @@ class ParticleSystem : public itk::DataObject {
 
   std::vector<std::string> m_FieldAttributes;
 
+  /** When false, SetPosition skips InvokeEvent for thread safety. */
+  bool m_EventsEnabled{true};
+
   std::mt19937 m_rand{42};
 };