Commit d8dfd64

tianmu-li and claude committed

fix: clean up multi-turn dead code and address PR mlcommons#285 reviewer comments

- Remove dead constant BLOCK_ON_PREVIOUS_TURN = -1 from scheduler.py
- Remove redundant outer with state.condition: in mark_turn_complete
- Remove ConversationMode import and explicit mode= args from integration tests
- Fix format: jsonl → format: ".jsonl" in example YAMLs and docs
- Add target_concurrency: 1 clarification to quickstart (preserves turn ordering)
- Remove broken HYBRID_SCHEDULER_GUIDE.md reference from quickstart

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

1 parent 26d07b3 commit d8dfd64

10 files changed: 56 additions, 198 deletions


docs/MULTI_TURN_QUICKSTART.md

Lines changed: 6 additions & 21 deletions
@@ -40,7 +40,7 @@ datasets:
   - name: my_conversations
     type: performance
     path: path/to/your/conversations.jsonl
-    format: jsonl
+    format: ".jsonl"
     multi_turn:              # ← Presence of this block enables multi-turn mode
       mode: independent      # ← Per-conv pipelines; no cross-conv turn barrier
       turn_timeout_s: 300    # ← Max wait for prev turn
@@ -125,6 +125,8 @@ mode: independent
 
 **Use for**: Realistic production load where short conversations finish while long ones are still running.
 For single-conversation debugging, use `mode: independent` with `target_concurrency: 1`.
+Note: unlike the plain `ConcurrencyScheduler`, multi-turn + `target_concurrency: 1` still enforces
+per-conversation turn ordering — turn N+1 waits for turn N even at concurrency 1.
 
 **Example timeline**:
 
@@ -136,21 +138,6 @@ t=0.8: conv1-turn3 (after conv1-turn2 completes)
 ...
 ```
 
-### Poisson Mode (Planned)
-
-```yaml
-mode: poisson
-conversations_per_second: 10.0
-```
-
-**Behavior**:
-
-- Start conversations with Poisson arrival
-- Sequence turns within each
-- Realistic user arrival patterns
-
-**Status**: Not yet implemented (falls back to `independent`)
-
 ---
 
 ## 🎛️ Concurrency Control (NEW!)
@@ -172,8 +159,6 @@ settings:
 - Medium (50-500 convs): `target_concurrency: 32`
 - Large (500+ convs): `target_concurrency: 64`
 
-**See**: `HYBRID_SCHEDULER_GUIDE.md` for detailed guide
-
 ---
 
 ## 🔧 Common Configurations
@@ -245,12 +230,12 @@ multi_turn:
 
 **Problem**: MultiTurnDataset not recognized.
 
-**Fix**: Ensure `format: jsonl` is specified in config:
+**Fix**: Ensure `format: ".jsonl"` is specified in config:
 
 ```yaml
 datasets:
   - path: your_file.jsonl
-    format: jsonl      # ← Required for JSONL
+    format: ".jsonl"   # ← Required for JSONL
 ```
 
 ---
@@ -353,7 +338,7 @@ Before running your first multi-turn benchmark:
 - [ ] Config has `multi_turn:` block in the dataset section
 - [ ] Config has `load_pattern.type: multi_turn`
 - [ ] Endpoint is running and reachable
-- [ ] `format: jsonl` specified for JSONL datasets
+- [ ] `format: ".jsonl"` specified for JSONL datasets
 - [ ] Conversation IDs are unique per conversation
 - [ ] Turn numbers are sequential (1, 2, 3, ...)
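The quickstart note above (turn N+1 waits for turn N even at concurrency 1) can be sketched with a small condition-variable gate. This is an illustrative stand-in, not the project's API; `TurnGate` and its methods are made up for the sketch.

```python
import threading

# Illustrative sketch of the per-conversation ordering guarantee: even with a
# single in-flight slot, turn N+1 is only issued after turn N has completed.
class TurnGate:
    def __init__(self):
        self.completed = 0  # turns finished so far
        self.cond = threading.Condition()

    def wait_for_turn(self, turn, timeout=None):
        """Block until turn - 1 prior turns have completed; True if ready."""
        with self.cond:
            return self.cond.wait_for(lambda: self.completed >= turn - 1, timeout)

    def mark_complete(self):
        with self.cond:
            self.completed += 1
            self.cond.notify_all()

gate = TurnGate()
order = []
for turn in (1, 2, 3):        # concurrency 1: turns are issued one at a time
    assert gate.wait_for_turn(turn)
    order.append(turn)
    gate.mark_complete()      # the response arrives before the next turn
print(order)                  # [1, 2, 3]
```

At higher concurrency the same gate still serializes turns within one conversation; only turns of different conversations overlap.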

examples/09_MultiTurn/README.md

Lines changed: 1 addition & 13 deletions
@@ -139,7 +139,7 @@ datasets:
   - name: customer_support
     type: performance
     path: examples/multi_turn/customer_support_conversations.jsonl
-    format: jsonl
+    format: ".jsonl"
     multi_turn:
       mode: independent
       turn_timeout_s: 300.0
@@ -204,18 +204,6 @@ still running. Turn 1 of one conversation and turn 100 of another can be in-flight.
 
 For single-conversation debugging, use `mode: independent` with `target_concurrency: 1`.
 
-#### Poisson Mode
-
-Starts conversations with Poisson arrival, sequences turns within each conversation.
-
-```yaml
-multi_turn:
-  mode: poisson
-  conversations_per_second: 10.0
-```
-
-**Use case**: Realistic arrival patterns (not yet implemented; falls back to `independent`)
-
 ### Turn Timeout
 
 Configure maximum wait time for previous turn completion:
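The turn-timeout handling visible in this commit's `wait_for_turn_ready` follows a standard pattern: compute an absolute deadline once, then re-derive the remaining wait on every wakeup. A hedged sketch (helper name is made up; the real code lives in `ConversationManager`):

```python
import threading
import time

# Sketch of the deadline pattern: spurious wakeups cannot stretch the total
# wait past the configured timeout, and the predicate is re-checked one last
# time when the deadline expires (mirroring the diff's final is_ready check).
def wait_with_deadline(cond, predicate, timeout):
    deadline = None if timeout is None else time.monotonic() + timeout
    with cond:
        while not predicate():
            if deadline is not None:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    return predicate()  # final re-check on timeout
                cond.wait(timeout=remaining)
            else:
                cond.wait()
        return True

cond = threading.Condition()
done = []
assert wait_with_deadline(cond, lambda: bool(done), timeout=0.05) is False
done.append(1)
assert wait_with_deadline(cond, lambda: bool(done), timeout=0.05) is True
```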

examples/09_MultiTurn/multi_turn_benchmark.yaml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ datasets:
   - name: customer_support_conversations
     type: performance
     path: examples/09_MultiTurn/customer_support_conversations.jsonl
-    format: jsonl
+    format: ".jsonl"
     samples: 10
     multi_turn:
       mode: independent

examples/09_MultiTurn/multi_turn_with_concurrency.yaml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ datasets:
   - name: customer_support_conversations
     type: performance
     path: examples/09_MultiTurn/customer_support_conversations.jsonl
-    format: jsonl
+    format: ".jsonl"
     samples: 10
     multi_turn:
       mode: independent    # All conv turn-1 start together
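The quickstart checklist requires unique conversation IDs and sequential turn numbers (1, 2, 3, ...). A small validator can check a conversations JSONL before running; this is a sketch assuming each record carries the `conversation_id` and `turn` fields that the scheduler reads in this commit (`sample_meta["conversation_id"]`, `sample_meta["turn"]`).

```python
import json
from collections import defaultdict

def check_conversations(jsonl_text):
    """Group records by conversation_id and verify turns run 1, 2, 3, ..."""
    turns = defaultdict(list)
    for line in jsonl_text.splitlines():
        if line.strip():
            rec = json.loads(line)
            turns[rec["conversation_id"]].append(rec["turn"])
    # True per conversation iff its turn numbers are exactly 1..N in order
    return {cid: ts == list(range(1, len(ts) + 1)) for cid, ts in turns.items()}

sample = "\n".join([
    '{"conversation_id": "c1", "turn": 1}',
    '{"conversation_id": "c1", "turn": 2}',
    '{"conversation_id": "c2", "turn": 1}',
])
print(check_conversations(sample))   # {'c1': True, 'c2': True}
```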

src/inference_endpoint/config/schema.py

Lines changed: 1 addition & 4 deletions
@@ -68,7 +68,6 @@ class LoadPatternType(str, Enum):
 class ConversationMode(str, Enum):
     """Multi-turn conversation scheduling modes."""
 
-    POISSON = "poisson"  # Poisson conv arrival, sequence turns within
     INDEPENDENT = "independent"  # Per-conv pipelines; no cross-conv turn barrier
 
 
@@ -246,16 +245,14 @@ class MultiTurnConfig(BaseModel):
     Presence of this block in the dataset config enables multi-turn mode.
 
     Attributes:
-        mode: Conversation scheduling strategy (independent or poisson).
+        mode: Conversation scheduling strategy (currently only independent).
         turn_timeout_s: Maximum seconds to wait for previous turn completion.
-        conversations_per_second: Target CPS for POISSON mode (None = use dataset order).
     """
 
     model_config = {"extra": "forbid"}
 
     mode: ConversationMode = ConversationMode.INDEPENDENT
     turn_timeout_s: float = 300.0
-    conversations_per_second: float | None = None  # For POISSON mode
     use_dataset_history: bool = True
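Because the real `MultiTurnConfig` is a pydantic model with `extra="forbid"`, configs that still set the removed `conversations_per_second` field will now fail validation. A simplified stand-in (plain dataclass instead of pydantic, made up for illustration) shows the same effect, since dataclasses also reject unknown keyword arguments:

```python
from dataclasses import dataclass
from enum import Enum

class ConversationMode(str, Enum):
    INDEPENDENT = "independent"   # only remaining mode after this commit

@dataclass
class MultiTurnConfig:
    # Simplified stand-in for the pydantic model; extra="forbid" there behaves
    # like the keyword rejection sketched here.
    mode: ConversationMode = ConversationMode.INDEPENDENT
    turn_timeout_s: float = 300.0
    use_dataset_history: bool = True

cfg = MultiTurnConfig()
assert cfg.mode == ConversationMode.INDEPENDENT

# The removed field is now an unknown key and is rejected:
try:
    MultiTurnConfig(conversations_per_second=10.0)
except TypeError as e:
    print("rejected:", e)
```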

src/inference_endpoint/load_generator/conversation_manager.py

Lines changed: 17 additions & 42 deletions
@@ -39,7 +39,6 @@ class ConversationState:
         conversation_id: Unique identifier for this conversation.
         current_turn: Last completed turn number (0 = not started).
         pending_client_turn: Turn number of in-flight client turn (None if idle).
-        turn_complete_event: Threading event to signal turn completion.
         expected_client_turns: Expected number of client turns (for completion tracking).
         issued_client_turns: Count of client turns issued.
         completed_client_turns: Count of client turns with responses.
@@ -54,7 +53,6 @@ class ConversationState:
     conversation_id: str
     current_turn: int = 0
     pending_client_turn: int | None = None
-    turn_complete_event: threading.Event = field(default_factory=threading.Event)
     expected_client_turns: int | None = None
     issued_client_turns: int = 0
     completed_client_turns: int = 0
@@ -106,8 +104,6 @@ def add_assistant_turn(self, content: str | None = None):
         self.completed_client_turns += 1
         self.condition.notify_all()
 
-        self.turn_complete_event.set()
-
         if self.is_complete():
             if self.failed_client_turns > 0:
                 logger.info(
@@ -145,8 +141,6 @@ def mark_turn_failed(self):
         )
         self.condition.notify_all()
 
-        self.turn_complete_event.set()
-
         if self.is_complete():
             logger.info(
                 f"Conversation {self.conversation_id} completed with failures: "
@@ -165,14 +159,11 @@ def is_complete(self) -> bool:
             return False  # Unknown completion, can't determine
         return self.completed_client_turns >= self.expected_client_turns
 
-    def is_ready_for_turn(self, turn: int) -> bool:
-        """Check if ready to issue this turn (previous turn must be complete).
-
-        Args:
-            turn: Turn number to check (unused; sequencing is based on completion counts).
+    def is_ready_for_turn(self) -> bool:
+        """Check if the previous turn has completed and the next may be issued.
 
         Returns:
-            True if ready to issue this turn, False otherwise.
+            True if ready to issue the next turn, False otherwise.
         """
         return (
             self.pending_client_turn is None
@@ -195,17 +186,14 @@ class ConversationManager:
     Each ConversationState carries its own Condition so that state changes
     (turn issued / turn complete) only wake the single pipeline thread waiting
     on that conversation, not all pipeline threads across all conversations.
-    A separate _created_condition handles the narrow case where a pipeline
-    thread calls wait_for_turn_issued before get_or_create has run.
+    All conversation states are pre-created by the scheduler before pipeline
+    threads start, so wait_for_turn_issued never races against get_or_create.
     """
 
     def __init__(self):
         """Initialize conversation manager with empty state."""
         self._conversations: dict[str, ConversationState] = {}
         self._lock = threading.Lock()
-        # Fired whenever a new conversation state is registered.
-        # Only needed by wait_for_turn_issued when state is None on entry.
-        self._created_condition = threading.Condition(self._lock)
 
     def get_or_create(
         self,
@@ -233,15 +221,13 @@ def get_or_create(
                 conversation_id=conversation_id,
                 current_turn=0,
                 pending_client_turn=None,
-                turn_complete_event=threading.Event(),
                 expected_client_turns=expected_client_turns,
                 issued_client_turns=0,
                 completed_client_turns=0,
                 failed_client_turns=0,
                 message_history=initial_history,
             )
             self._conversations[conversation_id] = state
-            self._created_condition.notify_all()
         return self._conversations[conversation_id]
 
     def wait_for_turn_ready(
@@ -272,11 +258,11 @@ def wait_for_turn_ready(
 
         deadline = None if timeout is None else time.monotonic() + timeout
         with state.condition:
-            while not state.is_ready_for_turn(turn):
+            while not state.is_ready_for_turn():
                 if deadline is not None:
                     remaining_timeout = deadline - time.monotonic()
                     if remaining_timeout <= 0:
-                        return state.is_ready_for_turn(turn)
+                        return state.is_ready_for_turn()
                     remaining_timeout = max(MIN_TIMEOUT_SECONDS, remaining_timeout)
                 else:
                     remaining_timeout = None
@@ -298,34 +284,24 @@ def wait_for_turn_issued(
         so the pipeline would enqueue subsequent turns before the load generator has
         registered the current one as in-flight.
 
+        All conversation states are pre-created by the scheduler before pipeline
+        threads start, so this method can look up the state directly without waiting
+        for it to be registered.
+
         Args:
             conversation_id: Conversation to wait for.
            min_issued: Minimum number of issued turns to wait for.
            timeout: Maximum seconds to wait (None = infinite).
 
         Returns:
            True if condition met, False if timeout.
+
+        Raises:
+            KeyError: If conversation_id not found (programming error — state must be
+                pre-created by the scheduler before pipeline threads are spawned).
         """
+        state = self._conversations[conversation_id]
         deadline = None if timeout is None else time.monotonic() + timeout
-
-        # Phase 1: wait until the conversation state exists (guarded by
-        # _created_condition so only the threads waiting on *this* conversation
-        # being created are woken — not all pipeline threads).
-        with self._created_condition:
-            state = self._conversations.get(conversation_id)
-            while state is None:
-                if deadline is not None:
-                    remaining = deadline - time.monotonic()
-                    if remaining <= 0:
-                        return False
-                    self._created_condition.wait(
-                        timeout=max(MIN_TIMEOUT_SECONDS, remaining)
-                    )
-                else:
-                    self._created_condition.wait()
-                state = self._conversations.get(conversation_id)
-
-        # Phase 2: wait for the issued counter using the per-conversation Condition.
         with state.condition:
             while state.issued_client_turns < min_issued:
                 if deadline is not None:
@@ -384,8 +360,7 @@ def mark_turn_complete(
         state = self._conversations.get(conversation_id)
         if state is None:
             raise KeyError(f"Conversation {conversation_id} not initialized")
-        with state.condition:
-            state.add_assistant_turn(response if store_in_history else None)
+        state.add_assistant_turn(response if store_in_history else None)
 
     def mark_turn_failed(self, conversation_id: str):
         """Mark that assistant response failed (error/timeout.

src/inference_endpoint/load_generator/scheduler.py

Lines changed: 12 additions & 32 deletions
@@ -23,7 +23,7 @@
 from typing import Any
 
 from ..config.runtime_settings import RuntimeSettings
-from ..config.schema import ConversationMode, LoadPatternType, MultiTurnConfig
+from ..config.schema import LoadPatternType, MultiTurnConfig
 from .conversation_manager import ConversationManager
 from .sample import SampleEvent, SampleEventHandler
 
@@ -438,9 +438,6 @@ def __iter__(self):
         yield s_idx, 0
 
 
-# Sentinel value to signal "block until previous turn completes"
-BLOCK_ON_PREVIOUS_TURN = -1
-
 # Sentinel object pushed by each independent-mode pipeline thread when it exhausts its turns
 _PIPELINE_DONE = object()
 
@@ -449,9 +446,8 @@ class MultiTurnScheduler(Scheduler, load_pattern=LoadPatternType.MULTI_TURN):
     """Scheduler for multi-turn conversations with turn sequencing and optional concurrency control.
 
     Enforces turn ordering within conversations: turn N+1 cannot be issued
-    until turn N completes. Supports multiple conversation scheduling modes:
-    - INDEPENDENT: Per-conv pipelines; no cross-conversation turn barrier (default)
-    - POISSON: Start conversations with Poisson arrival, sequence turns within
+    until turn N completes. Each conversation runs an independent pipeline thread
+    that gates on wait_for_turn_ready before enqueuing the next turn.
 
     Optionally limits total in-flight requests across all conversations via
     target_concurrency parameter, combining turn sequencing with concurrency control.
@@ -526,19 +522,9 @@ def __iter__(self):
         """Iterate with turn sequencing enforcement and optional concurrency control.
 
         Yields (sample_index, delay_ns) pairs. Turn blocking is handled inside
-        pipeline threads (INDEPENDENT mode). The concurrency gate is applied here
-        if target_concurrency is set.
+        pipeline threads. The concurrency gate is applied here if target_concurrency is set.
         """
-        mode = ConversationMode.INDEPENDENT
-        if self.multi_turn_config is not None:
-            mode = self.multi_turn_config.mode
-
-        if mode == ConversationMode.POISSON:
-            schedule = self._poisson_schedule()
-        else:
-            schedule = self._independent_schedule()
-
-        for s_idx, delay_ns in schedule:
+        for s_idx, delay_ns in self._independent_schedule():
             # Block on concurrency limit if enabled
             if self._condition is not None:
                 with self._condition:
@@ -548,19 +534,6 @@ def __iter__(self):
 
             yield s_idx, delay_ns
 
-    def _poisson_schedule(self):
-        """Start conversations with Poisson arrival, sequence turns within.
-
-        TODO: Implement Poisson conversation arrival.
-        For now, fall back to parallel mode.
-        """
-        # TODO: Implement Poisson conversation arrival
-        # For now, fall back to independent
-        logger.warning(
-            "Poisson conversation mode not yet implemented, using INDEPENDENT mode"
-        )
-        return self._independent_schedule()
-
     def _independent_schedule(self):
         """Issue turns per-conversation independently — no cross-conversation turn barriers.
 
@@ -590,6 +563,13 @@ def _independent_schedule(self):
             conv_id = sample_meta["conversation_id"]
             conv_samples[conv_id].append((sample_index, sample_meta["turn"]))
 
+        # Pre-create all conversation states before spawning pipeline threads so that
+        # wait_for_turn_issued can look them up directly without a creation race.
+        for conv_id, turns in conv_samples.items():
+            self.conversation_manager.get_or_create(
+                conv_id, expected_client_turns=len(turns)
+            )
+
         num_pipelines = len(conv_samples)
         if num_pipelines > _PIPELINE_THREAD_WARNING_THRESHOLD:
             logger.warning(
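The scheduler-side half of the fix can be sketched end to end: group samples by conversation, register every conversation (with its expected turn count) before any pipeline thread is spawned, then let threads look states up directly. Names here are invented for the sketch; only the grouping keys (`conversation_id`, `turn`) come from the diff above.

```python
import threading
from collections import defaultdict

# (conversation_id, turn) pairs as the scheduler would read from sample_meta
samples = [("c1", 1), ("c1", 2), ("c2", 1)]

conv_samples = defaultdict(list)
for conv_id, turn in samples:
    conv_samples[conv_id].append(turn)

# Pre-create every conversation's state BEFORE spawning pipeline threads,
# so no thread can observe a missing entry (the race the old code handled
# with a separate creation condition).
registry = {}
for conv_id, turns in conv_samples.items():
    registry[conv_id] = {"expected_client_turns": len(turns), "issued": 0}

def pipeline(conv_id):
    state = registry[conv_id]          # guaranteed to exist, no waiting needed
    state["issued"] = state["expected_client_turns"]

threads = [threading.Thread(target=pipeline, args=(c,)) for c in conv_samples]
for t in threads:
    t.start()
for t in threads:
    t.join()
print({c: registry[c]["issued"] for c in sorted(registry)})  # {'c1': 2, 'c2': 1}
```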

0 commit comments