Skip to content

Commit d7c6264

Browse files
authored
Fix data race on should_stop_ flag in LLM runner (#18652)
should_stop_ is written from the caller thread via stop() and read from the inference thread in the generate loop. Accessing a plain bool from two threads without synchronization is a data race, which is undefined behavior per the C++ standard; in practice the compiler is free to cache the flag in a register and hoist the load out of the loop, so the stop request may never be observed by the reading thread. Change bool to std::atomic&lt;bool&gt; with relaxed memory ordering, which is sufficient for a simple cancellation flag (no other data is published through it) and has negligible overhead.
1 parent 56d6e4d commit d7c6264

4 files changed

Lines changed: 6 additions & 12 deletions

File tree

extension/llm/runner/text_decoder_runner.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
6969
return method_name_;
7070
}
7171

72-
inline void stop() {
73-
should_stop_ = true;
74-
}
75-
7672
/**
7773
* Sample the next token from the logits tensor.
7874
* @param logits_tensor The logits tensor.
@@ -98,7 +94,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
9894
Module* module_;
9995
IOManager* io_manager_;
10096
std::string method_name_;
101-
bool should_stop_{false};
10297
};
10398

10499
} // namespace llm

extension/llm/runner/text_llm_runner.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ Error TextLLMRunner::generate(
108108
// return a response token.
109109

110110
stats_->inference_start_ms = time_in_ms();
111-
shouldStop_ = false;
112111

113112
int64_t max_context_len = metadata_.at(kMaxContextLen);
114113

extension/llm/runner/text_llm_runner.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
161161
void stop() override;
162162

163163
private:
164-
bool shouldStop_{false};
165-
166164
// Components
167165
std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
168166
std::unordered_map<std::string, int64_t> metadata_;

extension/llm/runner/text_token_generator.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
// Generate tokens in a loop.
1010
#pragma once
1111

12+
#include <atomic>
13+
1214
#include <executorch/extension/llm/runner/stats.h>
1315
#include <executorch/extension/llm/runner/text_decoder_runner.h>
1416
#include <executorch/extension/tensor/tensor.h>
@@ -95,7 +97,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
9597
resize_tensor_ptr(tokens_managed, token_shape));
9698
}
9799

98-
should_stop_ = false;
100+
should_stop_.store(false, std::memory_order_relaxed);
99101

100102
// Generate our tokens
101103
while (pos < start_pos + max_new_tokens) {
@@ -136,7 +138,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
136138
}
137139
token_callback(std::move(*decode_result));
138140

139-
if (should_stop_) {
141+
if (should_stop_.load(std::memory_order_relaxed)) {
140142
break;
141143
}
142144

@@ -154,7 +156,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
154156
* Stop the generation loop.
155157
*/
156158
inline void stop() {
157-
should_stop_ = true;
159+
should_stop_.store(true, std::memory_order_relaxed);
158160
}
159161

160162
/**
@@ -188,7 +190,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
188190
bool ignore_eos_ = false;
189191

190192
// state machine
191-
bool should_stop_ = false;
193+
std::atomic<bool> should_stop_{false};
192194

193195
// stats
194196
Stats* stats_;

0 commit comments

Comments (0)