[https://nvbugs/6245861][fix] Gate the two ID None-checks on finish_reason in _GEN_PENDING_FINISH_REASONS… (#14908)

tensorrt-cicd · web-flow · commit f3b718ad7469 · 2026-06-16T10:31:52.000-04:00
Signed-off-by: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com>
diff --git a/tensorrt_llm/serve/openai_disagg_service.py b/tensorrt_llm/serve/openai_disagg_service.py
@@ -46,6 +46,10 @@
 )
 from tensorrt_llm.serve.router import KvCacheAwareRouter, Router
 
+# Finish reasons for which a GEN handoff is still pending; any other reason means
+# the CTX request already completed and the disagg KV-cache handoff was never set up.
+_GEN_PENDING_FINISH_REASONS = ("length", "not_finished")
+
 
 class OpenAIDisaggregatedService(OpenAIService):
     def __init__(
@@ -174,7 +178,7 @@ async def _send_disagg_request_ctx_first(
             return ctx_response
 
     def _need_gen(self, response: UCompletionResponse) -> bool:
-        if response and response.choices[0].finish_reason not in ["length", "not_finished"]:
+        if response and response.choices[0].finish_reason not in _GEN_PENDING_FINISH_REASONS:
             del response.choices[0].disaggregated_params
             return False
         return True
@@ -384,24 +388,28 @@ async def _on_worker_event(self, worker_info: WorkerInfo, event_type: WatchEvent
     async def _verify_ctx_response(self, ctx_response: UCompletionResponse) -> None:
         if ctx_response:
             for idx, choice in enumerate(ctx_response.choices):
-                choice = ctx_response.choices[idx]
                 if choice.disaggregated_params is None:
                     raise ValueError(
                         f"Context server choice {idx} did not return disaggregated params."
                         f" finish_reason={choice.finish_reason!r}"
                     )
-                if choice.disaggregated_params.ctx_request_id is None:
-                    raise ValueError(
-                        f"Invalid disaggregated params: ctx_request_id is None for choice {idx}."
-                        f" finish_reason={choice.finish_reason!r},"
-                        f" disagg_request_id={choice.disaggregated_params.disagg_request_id!r}"
-                    )
-                if choice.disaggregated_params.disagg_request_id is None:
-                    raise ValueError(
-                        f"Invalid disaggregated params: disagg_request_id is None for choice {idx}."
-                        f" finish_reason={choice.finish_reason!r},"
-                        f" ctx_request_id={choice.disaggregated_params.ctx_request_id!r}"
-                    )
+                # A CTX request that finished early (e.g. EOS during prefill) never
+                # sets up the KV-cache handoff, so ctx_request_id/disagg_request_id
+                # stay None. Only enforce them when a GEN handoff is still pending --
+                # mirroring _need_gen, which skips the handoff for these responses.
+                if choice.finish_reason in _GEN_PENDING_FINISH_REASONS:
+                    if choice.disaggregated_params.ctx_request_id is None:
+                        raise ValueError(
+                            f"Invalid disaggregated params: ctx_request_id is None for choice {idx}."
+                            f" finish_reason={choice.finish_reason!r},"
+                            f" disagg_request_id={choice.disaggregated_params.disagg_request_id!r}"
+                        )
+                    if choice.disaggregated_params.disagg_request_id is None:
+                        raise ValueError(
+                            f"Invalid disaggregated params: disagg_request_id is None for choice {idx}."
+                            f" finish_reason={choice.finish_reason!r},"
+                            f" ctx_request_id={choice.disaggregated_params.ctx_request_id!r}"
+                        )
             return ctx_response
 
     async def _send_disagg_request_gen_first(
diff --git a/tests/unittest/disaggregated/test_openai_disagg_service.py b/tests/unittest/disaggregated/test_openai_disagg_service.py
@@ -429,7 +429,7 @@ async def test_missing_ctx_request_id_includes_disagg_id(self):
     @pytest.mark.asyncio
     async def test_missing_disagg_request_id_includes_ctx_id(self):
         svc = _make_service("context_first")
-        resp = _make_completion_response("", finish_reason="stop", disagg_request_id=555)
+        resp = _make_completion_response("", finish_reason="length", disagg_request_id=555)
         resp.choices[0].disaggregated_params.disagg_request_id = None
         resp.choices[0].disaggregated_params.ctx_request_id = 555
         with pytest.raises(ValueError, match=r"disagg_request_id is None.*555"):
@@ -442,6 +442,24 @@ async def test_valid_response_passes(self):
         result = await svc._verify_ctx_response(resp)
         assert result is resp
 
+    @pytest.mark.asyncio
+    async def test_completed_response_with_null_ctx_request_id_passes(self):
+        # CTX finished early (finish_reason='stop'): no GEN handoff was set up,
+        # so ctx_request_id is None. The verifier must accept it (NVBug 6245861).
+        svc = _make_service("context_first")
+        resp = _make_completion_response("", finish_reason="stop", disagg_request_id=42)
+        resp.choices[0].disaggregated_params.ctx_request_id = None
+        result = await svc._verify_ctx_response(resp)
+        assert result is resp
+
+    @pytest.mark.asyncio
+    async def test_completed_response_with_null_disagg_request_id_passes(self):
+        svc = _make_service("context_first")
+        resp = _make_completion_response("", finish_reason="stop", disagg_request_id=42)
+        resp.choices[0].disaggregated_params.disagg_request_id = None
+        result = await svc._verify_ctx_response(resp)
+        assert result is resp
+
 
 class TestFirstGenLogProbsSerializeRoundtrip:
     """Roundtrip tests for _serialize/_deserialize_first_gen_log_probs."""