[Fix] Fix race condition in flashinfer backend (#103)

DarkSharpness · web-flow · commit 20fcd7f3c6a1 · 2026-03-13T14:32:39.000+08:00
diff --git a/python/minisgl/attention/fi.py b/python/minisgl/attention/fi.py
@@ -117,15 +117,19 @@ def __init__(self, config: ModelConfig) -> None:
         self.max_graph_bs = 0
         self.graph_wrappers: Dict[int, CUDAGraphBatchDecodeWithPagedKVCacheWrapper] = {}
         self.capture: FICaptureData | None = None
+        self.last_event = torch.cuda.Event()
+        self.last_event.record()
 
-    @staticmethod
-    def _initialize_metadata_once(metadata: FIMetadata) -> None:
+    def _initialize_metadata_once(self, metadata: FIMetadata) -> None:
         if metadata.initialized:
             return
 
         from flashinfer import BatchDecodeWithPagedKVCacheWrapper
 
         metadata.initialized = True
+        # FlashInfer planning reuses a pinned host staging buffer for an async H2D
+        # copy. Wait for the previous plan's copy before this plan overwrites it.
+        self.last_event.synchronize()
         if isinstance(metadata.wrapper, BatchDecodeWithPagedKVCacheWrapper):
             metadata.wrapper.plan(
                 indptr=metadata.cu_seqlens_k_cpu,
@@ -159,6 +163,7 @@ def _initialize_metadata_once(metadata: FIMetadata) -> None:
                 non_blocking=True,
                 causal=True,
             )
+        self.last_event.record()
 
     def _get_ones_cpu(self, bs: int) -> torch.Tensor:
         if bs <= len(self.cached_ones_cpu):
diff --git a/python/minisgl/env.py b/python/minisgl/env.py
@@ -67,7 +67,6 @@ class EnvClassSingleton:
     # backend runtime
     FLASHINFER_USE_TENSOR_CORES = EnvOption()
     DISABLE_OVERLAP_SCHEDULING = EnvBool(False)
-    OVERLAP_EXTRA_SYNC = EnvBool(False)
     PYNCCL_MAX_BUFFER_SIZE = EnvMem(1024**3)
 
     def __new__(cls):
diff --git a/python/minisgl/scheduler/scheduler.py b/python/minisgl/scheduler/scheduler.py
@@ -227,8 +227,6 @@ def _schedule_next_batch(self) -> ForwardInput | None:
     def _forward(self, forward_input: ForwardInput) -> ForwardOutput:
         batch, sample_args, input_mapping, output_mapping = forward_input
         batch.input_ids = self.token_pool[input_mapping]
-        if ENV.OVERLAP_EXTRA_SYNC:  # NOTE: https://github.com/sgl-project/mini-sglang/issues/58
-            self.stream.synchronize()
         forward_output = self.engine.forward_batch(batch, sample_args)
         self.token_pool[output_mapping] = forward_output.next_tokens_gpu
         self.decode_manager.filter_reqs(forward_input.batch.reqs)