
Commit e68d9d8

mistral - latency
1 parent 8dbea7c commit e68d9d8

2 files changed: 51 additions & 39 deletions

src/strands/models/mistral.py (21 additions & 4 deletions)
@@ -6,6 +6,7 @@
 import base64
 import json
 import logging
+import time
 from typing import Any, AsyncGenerator, Iterable, Optional, Type, TypeVar, Union
 
 import mistralai
@@ -334,7 +335,8 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                 return {"messageStop": {"stopReason": reason}}
 
             case "metadata":
-                usage = event["data"]
+                usage = event["data"]["usage"]
+                metrics = event["data"]["metrics"]
                 return {
                     "metadata": {
                         "usage": {
@@ -343,7 +345,7 @@ def format_chunk(self, event: dict[str, Any]) -> StreamEvent:
                             "totalTokens": usage.total_tokens,
                         },
                         "metrics": {
-                            "latencyMs": event.get("latency_ms", 0),
+                            "latencyMs": metrics["latency"] * 1000,
                         },
                     },
                 }
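
With this change the metadata event nests usage and metrics under "data", and the seconds-to-milliseconds conversion happens in one place. A minimal sketch of just this case, using types.SimpleNamespace as a stand-in for the SDK's usage object (format_metadata is an illustrative helper, not the real method):

from types import SimpleNamespace
from typing import Any

def format_metadata(event: dict[str, Any]) -> dict[str, Any]:
    # Mirrors the "metadata" case above: usage and metrics come out of "data".
    usage = event["data"]["usage"]
    metrics = event["data"]["metrics"]
    return {
        "metadata": {
            "usage": {
                "inputTokens": usage.prompt_tokens,
                "outputTokens": usage.completion_tokens,
                "totalTokens": usage.total_tokens,
            },
            # Internal latency is tracked in seconds; the event reports ms.
            "metrics": {"latencyMs": metrics["latency"] * 1000},
        },
    }

usage = SimpleNamespace(prompt_tokens=100, completion_tokens=50, total_tokens=150)
chunk = format_metadata({"data": {"usage": usage, "metrics": {"latency": 0.25}}})
assert chunk["metadata"]["metrics"]["latencyMs"] == 250
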
@@ -360,6 +362,8 @@ def _handle_non_streaming_response(self, response: Any) -> Iterable[dict[str, Any]]:
         Yields:
             Formatted events that match the streaming format.
         """
+        start_time = time.time()
+
         yield {"chunk_type": "message_start"}
 
         content_started = False
@@ -389,7 +393,12 @@ def _handle_non_streaming_response(self, response: Any) -> Iterable[dict[str, Any]]:
         yield {"chunk_type": "message_stop", "data": finish_reason}
 
         if hasattr(response, "usage") and response.usage:
-            yield {"chunk_type": "metadata", "data": response.usage}
+            end_time = time.time()
+            latency = end_time - start_time
+            yield {
+                "chunk_type": "metadata",
+                "data": {"usage": response.usage, "metrics": {"latency": latency}},
+            }
 
     @override
     async def stream(
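
The non-streaming handler brackets its work with time.time() and reports the difference, in seconds, on the final metadata event. Note that because _handle_non_streaming_response is a generator, the clock starts when iteration begins, not when the method is called. A self-contained sketch of the pattern (handle_response and fake_response are hypothetical stand-ins):

import time
from types import SimpleNamespace
from typing import Any, Iterable

def handle_response(response: Any) -> Iterable[dict[str, Any]]:
    # The clock starts on the first next(); generators run lazily.
    start_time = time.time()
    yield {"chunk_type": "message_start"}
    # ... content and stop events would be yielded here ...
    latency = time.time() - start_time  # seconds
    yield {
        "chunk_type": "metadata",
        "data": {"usage": response.usage, "metrics": {"latency": latency}},
    }

fake_response = SimpleNamespace(usage=SimpleNamespace(total_tokens=150))
events = list(handle_response(fake_response))
assert events[-1]["data"]["metrics"]["latency"] >= 0
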
@@ -434,6 +443,7 @@ async def stream(
 
             # Use the streaming API
             async with mistralai.Mistral(**self.client_args) as client:
+                start_time = time.time()
                 stream_response = await client.chat.stream_async(**request)
 
                 yield self.format_chunk({"chunk_type": "message_start"})
@@ -488,7 +498,14 @@ async def stream(
                         yield self.format_chunk({"chunk_type": "message_stop", "data": choice.finish_reason})
 
                     if hasattr(chunk, "usage"):
-                        yield self.format_chunk({"chunk_type": "metadata", "data": chunk.usage})
+                        end_time = time.time()
+                        latency = end_time - start_time
+                        yield self.format_chunk(
+                            {
+                                "chunk_type": "metadata",
+                                "data": {"usage": chunk.usage, "metrics": {"latency": latency}},
+                            }
+                        )
 
         except Exception as e:
             if "rate" in str(e).lower() or "429" in str(e):

tests/strands/models/test_mistral.py (30 additions & 35 deletions)
@@ -68,6 +68,12 @@ class TestOutputModel(pydantic.BaseModel):
     return TestOutputModel
 
 
+@pytest.fixture
+def mock_time():
+    with unittest.mock.patch.object(strands.models.mistral, "time") as mock:
+        yield mock.time
+
+
 def test__init__model_configs(mistral_client, model_id, max_tokens):
     _ = mistral_client
 
@@ -380,38 +386,12 @@ def test_format_chunk_metadata(model):
 
     event = {
         "chunk_type": "metadata",
-        "data": mock_usage,
-        "latency_ms": 250,
-    }
-
-    actual_chunk = model.format_chunk(event)
-    exp_chunk = {
-        "metadata": {
-            "usage": {
-                "inputTokens": 100,
-                "outputTokens": 50,
-                "totalTokens": 150,
-            },
-            "metrics": {
-                "latencyMs": 250,
-            },
+        "data": {
+            "usage": mock_usage,
+            "metrics": {"latency": 0.001},
         },
     }
 
-    assert actual_chunk == exp_chunk
-
-
-def test_format_chunk_metadata_no_latency(model):
-    mock_usage = unittest.mock.Mock()
-    mock_usage.prompt_tokens = 100
-    mock_usage.completion_tokens = 50
-    mock_usage.total_tokens = 150
-
-    event = {
-        "chunk_type": "metadata",
-        "data": mock_usage,
-    }
-
     actual_chunk = model.format_chunk(event)
     exp_chunk = {
         "metadata": {
@@ -421,7 +401,7 @@ def test_format_chunk_metadata_no_latency(model):
                 "totalTokens": 150,
             },
             "metrics": {
-                "latencyMs": 0,
+                "latencyMs": 1,
            },
         },
     }
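
The separate no-latency test is dropped because metrics are now always present in the event. One subtlety in the consolidated test: the event's latency of 0.001 s goes through the × 1000 conversion as float arithmetic, yet the expected chunk uses the integer 1. That works because the product rounds to exactly 1.0, and 1.0 == 1 in Python:

assert 0.001 * 1000 == 1  # the float product is exactly 1.0, which equals int 1
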
@@ -437,7 +417,9 @@ def test_format_chunk_unknown(model):
 
 
 @pytest.mark.asyncio
-async def test_stream(mistral_client, model, agenerator, alist):
+async def test_stream(mistral_client, model, mock_time, agenerator, alist):
+    mock_time.side_effect = [0, 0.001]
+
     mock_usage = unittest.mock.Mock()
     mock_usage.prompt_tokens = 100
     mock_usage.completion_tokens = 50
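
The mock_time fixture above plus this side_effect script the clock: patch.object swaps the time module as seen from inside strands.models.mistral, and successive time.time() calls return 0, then 0.001. A self-contained sketch of the same trick, patching the current module as a stand-in for the module under test:

import sys
import time
import unittest.mock

def measure() -> float:
    # Toy stand-in for the code under test: brackets work with time.time().
    start = time.time()
    return time.time() - start

# Patch the `time` name on the module containing measure(), exactly as the
# fixture patches strands.models.mistral.
with unittest.mock.patch.object(sys.modules[__name__], "time") as mock:
    mock.time.side_effect = [0, 0.001]  # scripted return values, call by call
    assert measure() == 0.001
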
@@ -458,10 +440,8 @@ async def test_stream(mistral_client, model, agenerator, alist):
     mistral_client.chat.stream_async = unittest.mock.AsyncMock(return_value=agenerator([mock_event]))
 
     messages = [{"role": "user", "content": [{"text": "test"}]}]
-    response = model.stream(messages, None, None)
-
-    # Consume the response
-    await alist(response)
+    stream = model.stream(messages, None, None)
+    responses = await alist(stream)
 
     expected_request = {
         "model": "mistral-large-latest",
@@ -472,6 +452,21 @@
 
     mistral_client.chat.stream_async.assert_called_once_with(**expected_request)
 
+    tru_metadata = responses[-1]
+    exp_metadata = {
+        "metadata": {
+            "usage": {
+                "inputTokens": 100,
+                "outputTokens": 50,
+                "totalTokens": 150,
+            },
+            "metrics": {
+                "latencyMs": 1,
+            },
+        },
+    }
+    assert tru_metadata == exp_metadata
+
 
 @pytest.mark.asyncio
 async def test_stream_rate_limit_error(mistral_client, model, alist):
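
The final assertion ties the scripted clock to the formatted output: time.time() returns 0 when the stream starts and 0.001 when the usage chunk arrives, so:

start, end = 0, 0.001       # values supplied by mock_time.side_effect
latency = end - start       # seconds, as computed inside stream()
assert latency * 1000 == 1  # emitted as "latencyMs": 1 by format_chunk
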
