Fix type annotation for msgspec serialization error, add tests to catch similar errors in the future

nv-alicheng · nv-alicheng · commit 2146b0e08694 · 2025-12-16T12:56:53.000-08:00
diff --git a/src/inference_endpoint/core/types.py b/src/inference_endpoint/core/types.py
@@ -48,6 +48,10 @@ class QueryStatus(Enum):
     CANCELLED = "cancelled"
 
 
+_OUTPUT_DICT_TYPE = dict[str, str | list[str]]
+_OUTPUT_RESULT_TYPE = str | tuple[str, ...] | _OUTPUT_DICT_TYPE | None
+
+
 class Query(msgspec.Struct, kw_only=True):
     """Represents a single inference query to be sent to an endpoint.
 
@@ -105,10 +109,10 @@ class QueryResult(msgspec.Struct, tag="query_result", kw_only=True, frozen=True)
     """
 
     id: str = ""
-    response_output: str | tuple[str, ...] | None = None
+    response_output: _OUTPUT_RESULT_TYPE = None
     metadata: dict[str, Any] = msgspec.field(default_factory=dict)
     error: str | None = None
-    completed_at: float = msgspec.UNSET
+    completed_at: int = msgspec.UNSET
 
     def __post_init__(self):
         """Set completion timestamp automatically.
@@ -122,6 +126,9 @@ def __post_init__(self):
         """
         # Disallow user setting completed_at time to prevent cheating.
         # Timestamp must be generated internally
+        # Note that this will also be regenerated during encode+decode. This is
+        # intentional, since timestamps in child and parent processes may be different
+        # due to how monotonic_ns works.
         msgspec.structs.force_setattr(self, "completed_at", time.monotonic_ns())
 
         # A list can be passed on, but we need to convert it to a tuple to maintain immutability,
diff --git a/src/inference_endpoint/endpoint_client/worker.py b/src/inference_endpoint/endpoint_client/worker.py
@@ -462,6 +462,9 @@ async def _handle_streaming_request(self, query: Query) -> None:
 
             # Send final complete response
             if reasoning_chunks:
+                # If there are reasoning chunks, then the first chunk received
+                # is the first reasoning chunk. The rest of the reasoning chunks,
+                # as well as the output chunks, can be joined together.
                 resp_reasoning = [reasoning_chunks[0]]
                 if len(reasoning_chunks) > 1:
                     resp_reasoning.append("".join(reasoning_chunks[1:]))
@@ -470,6 +473,8 @@ async def _handle_streaming_request(self, query: Query) -> None:
                     "reasoning": resp_reasoning,
                 }
             elif output_chunks:
+                # If there are only output chunks, the first chunk is the one used for
+                # TTFT calculations. The rest are joined together.
                 resp_output = [output_chunks[0]]
                 if len(output_chunks) > 1:
                     resp_output.append("".join(output_chunks[1:]))
diff --git a/tests/unit/core/test_types.py b/tests/unit/core/test_types.py