From 9bb9ac89c5d3c84f4e0d7dd464757575259ff04b Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Tue, 31 Mar 2026 21:00:41 -0700 Subject: [PATCH 1/6] feat(serve): improve OpenAI API compatibility with usage, finish_reason, and system_fingerprint Add missing OpenAI API fields to m serve chat completion responses: - Add `finish_reason` field to Choice model (defaults to "stop") - Add `usage` field with token counts (prompt_tokens, completion_tokens, total_tokens) extracted from ModelOutputThunk.usage when available - Add `system_fingerprint` field populated from model or provider metadata - Fix bug in model_options extraction (use model_dump().items() instead of iterating request) Includes comprehensive test coverage with 13 unit tests verifying all new fields and edge cases (missing data, partial data, fallback behavior). Signed-off-by: Mark Sturdevant --- cli/serve/app.py | 31 +++++- cli/serve/models.py | 23 ++++ test/cli/test_serve.py | 243 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 295 insertions(+), 2 deletions(-) create mode 100644 test/cli/test_serve.py diff --git a/cli/serve/app.py b/cli/serve/app.py index d126f2261..d948225b1 100644 --- a/cli/serve/app.py +++ b/cli/serve/app.py @@ -10,7 +10,13 @@ import uvicorn from fastapi import FastAPI -from .models import ChatCompletion, ChatCompletionMessage, ChatCompletionRequest, Choice +from .models import ( + ChatCompletion, + ChatCompletionMessage, + ChatCompletionRequest, + Choice, + CompletionUsage, +) app = FastAPI( title="M serve OpenAI API Compatible Server", @@ -40,10 +46,28 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion: input=request.messages, requirements=request.requirements, model_options={ - k: v for k, v in request if k not in ["messages", "requirements"] + k: v + for k, v in request.model_dump().items() + if k not in ["messages", "requirements"] }, ) + # Extract usage information from the ModelOutputThunk if available + usage = None + if hasattr(output, "usage") and output.usage is not None: + usage = CompletionUsage( + prompt_tokens=output.usage.get("prompt_tokens", 0), + completion_tokens=output.usage.get("completion_tokens", 0), + total_tokens=output.usage.get("total_tokens", 0), + ) + + # Extract system_fingerprint (model identifier) if available + system_fingerprint = None + if hasattr(output, "model") and output.model is not None: + system_fingerprint = output.model + elif hasattr(output, "provider") and output.provider is not None: + system_fingerprint = output.provider + return ChatCompletion( id=completion_id, model=request.model, @@ -54,9 +78,12 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion: message=ChatCompletionMessage( content=output.value, role="assistant" ), + finish_reason="stop", ) ], object="chat.completion", # type: ignore + system_fingerprint=system_fingerprint, + usage=usage, ) # type: ignore endpoint.__name__ = f"chat_{module.__name__}_endpoint" diff --git a/cli/serve/models.py b/cli/serve/models.py index 9d9a7cc51..1888e2d00 100644 --- a/cli/serve/models.py +++ b/cli/serve/models.py @@ -82,6 +82,23 @@ class Choice(BaseModel): message: ChatCompletionMessage """A chat completion message generated by the model.""" + finish_reason: ( + Literal["stop", "length", "content_filter", "tool_calls", "function_call"] + | None + ) = "stop" + """The reason the model stopped generating tokens.""" + + +class CompletionUsage(BaseModel): + completion_tokens: int + """Number of tokens in the generated completion.""" + + prompt_tokens: int + """Number of tokens in the prompt.""" + + total_tokens: int + """Total number of tokens used in the request (prompt + completion).""" + class ChatCompletion(BaseModel): id: str @@ -101,3 +118,9 @@ class ChatCompletion(BaseModel): object: Literal["chat.completion"] """The object type, which is always `chat.completion`.""" + + system_fingerprint: str | None = None + """This fingerprint represents the backend configuration that the model runs with.""" + + usage: CompletionUsage | None = None + """Usage statistics for the completion request.""" diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py new file mode 100644 index 000000000..1d658138c --- /dev/null +++ b/test/cli/test_serve.py @@ -0,0 +1,243 @@ +"""Tests for the m serve OpenAI-compatible API server.""" + +from unittest.mock import Mock, patch + +import pytest + +from cli.serve.app import make_chat_endpoint +from cli.serve.models import ( + ChatCompletion, + ChatCompletionRequest, + ChatMessage, + CompletionUsage, +) +from mellea.core.base import ModelOutputThunk + + +@pytest.fixture +def mock_module(): + """Create a mock module with a serve function.""" + module = Mock() + module.__name__ = "test_module" + return module + + +@pytest.fixture +def sample_request(): + """Create a sample ChatCompletionRequest.""" + return ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Hello, world!")], + temperature=0.7, + max_tokens=100, + ) + + +class TestChatEndpoint: + """Tests for the chat completion endpoint.""" + + @pytest.mark.asyncio + async def test_basic_completion(self, mock_module, sample_request): + """Test basic chat completion returns correct structure.""" + # Setup mock output + mock_output = ModelOutputThunk("Hello! How can I help you?") + mock_module.serve.return_value = mock_output + + # Create endpoint and call it + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + # Verify response structure + assert isinstance(response, ChatCompletion) + assert response.model == "test-model" + assert len(response.choices) == 1 + assert response.choices[0].message.content == "Hello! How can I help you?" + assert response.choices[0].message.role == "assistant" + assert response.choices[0].index == 0 + + @pytest.mark.asyncio + async def test_finish_reason_included(self, mock_module, sample_request): + """Test that finish_reason is included in the response.""" + mock_output = ModelOutputThunk("Test response") + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.choices[0].finish_reason == "stop" + + @pytest.mark.asyncio + async def test_usage_field_populated(self, mock_module, sample_request): + """Test that usage field is populated when available.""" + mock_output = ModelOutputThunk("Test response") + mock_output.usage = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + } + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.usage is not None + assert isinstance(response.usage, CompletionUsage) + assert response.usage.prompt_tokens == 10 + assert response.usage.completion_tokens == 5 + assert response.usage.total_tokens == 15 + + @pytest.mark.asyncio + async def test_usage_field_none_when_unavailable(self, mock_module, sample_request): + """Test that usage field is None when not available.""" + mock_output = ModelOutputThunk("Test response") + # Don't set usage field + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.usage is None + + @pytest.mark.asyncio + async def test_system_fingerprint_from_model(self, mock_module, sample_request): + """Test that system_fingerprint is populated from model field.""" + mock_output = ModelOutputThunk("Test response") + mock_output.model = "gpt-4-turbo" + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.system_fingerprint == "gpt-4-turbo" + + @pytest.mark.asyncio + async def test_system_fingerprint_from_provider(self, mock_module, sample_request): + """Test that system_fingerprint falls back to provider field.""" + mock_output = ModelOutputThunk("Test response") + mock_output.provider = "ollama" + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.system_fingerprint == "ollama" + + @pytest.mark.asyncio + async def test_system_fingerprint_none_when_unavailable( + self, mock_module, sample_request + ): + """Test that system_fingerprint is None when not available.""" + mock_output = ModelOutputThunk("Test response") + # Don't set model or provider + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.system_fingerprint is None + + @pytest.mark.asyncio + async def test_model_options_passed_correctly(self, mock_module, sample_request): + """Test that model options are passed to serve function correctly.""" + mock_output = ModelOutputThunk("Test response") + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + await endpoint(sample_request) + + # Verify serve was called with correct arguments + call_args = mock_module.serve.call_args + assert call_args is not None + assert "model_options" in call_args.kwargs + model_options = call_args.kwargs["model_options"] + + # Should include temperature and max_tokens but not messages/requirements + assert "temperature" in model_options + assert model_options["temperature"] == 0.7 + assert "max_tokens" in model_options + assert model_options["max_tokens"] == 100 + assert "messages" not in model_options + assert "requirements" not in model_options + + @pytest.mark.asyncio + async def test_completion_id_format(self, mock_module, sample_request): + """Test that completion ID follows OpenAI format.""" + mock_output = ModelOutputThunk("Test response") + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + # Should start with "chatcmpl-" and be 38 characters total (chatcmpl- + 29 hex chars) + assert response.id.startswith("chatcmpl-") + assert len(response.id) == 38 + + @pytest.mark.asyncio + async def test_created_timestamp_present(self, mock_module, sample_request): + """Test that created timestamp is present and reasonable.""" + mock_output = ModelOutputThunk("Test response") + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + # Should be a Unix timestamp (positive integer) + assert isinstance(response.created, int) + assert response.created > 0 + + @pytest.mark.asyncio + async def test_object_type_correct(self, mock_module, sample_request): + """Test that object type is set correctly.""" + mock_output = ModelOutputThunk("Test response") + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.object == "chat.completion" + + @pytest.mark.asyncio + async def test_usage_with_partial_data(self, mock_module, sample_request): + """Test that usage handles missing fields gracefully.""" + mock_output = ModelOutputThunk("Test response") + # Only provide some fields + mock_output.usage = { + "prompt_tokens": 10 + # Missing completion_tokens and total_tokens + } + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + assert response.usage is not None + assert response.usage.prompt_tokens == 10 + assert response.usage.completion_tokens == 0 # Should default to 0 + assert response.usage.total_tokens == 0 + + @pytest.mark.asyncio + async def test_all_fields_together(self, mock_module, sample_request): + """Test that all new fields work together correctly.""" + mock_output = ModelOutputThunk("Complete response") + mock_output.usage = { + "prompt_tokens": 20, + "completion_tokens": 10, + "total_tokens": 30, + } + mock_output.model = "gpt-4" + mock_output.provider = "openai" + mock_module.serve.return_value = mock_output + + endpoint = make_chat_endpoint(mock_module) + response = await endpoint(sample_request) + + # Verify all fields are present + assert response.choices[0].finish_reason == "stop" + assert response.usage is not None + assert response.usage.total_tokens == 30 + assert response.system_fingerprint == "gpt-4" # model takes precedence + assert response.object == "chat.completion" + assert response.id.startswith("chatcmpl-") + + +# Made with Bob From 97356b0f6128618a9ce6564cb9ef0a222190dfb7 Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 12:46:40 -0700 Subject: [PATCH 2/6] Remove made with Bob comment Signed-off-by: Mark Sturdevant --- test/cli/test_serve.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index 1d658138c..db23d6f41 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -238,6 +238,3 @@ async def test_all_fields_together(self, mock_module, sample_request): assert response.system_fingerprint == "gpt-4" # model takes precedence assert response.object == "chat.completion" assert response.id.startswith("chatcmpl-") - - -# Made with Bob From 9649315812cb15e941a0809945e838f9d471165e Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 13:39:55 -0700 Subject: [PATCH 3/6] fix(serve): set system_fingerprint to None per OpenAI spec system_fingerprint should be a backend config hash, not the model name. The model name is already in response.model. Set to None since we don't currently track backend config fingerprints. Signed-off-by: Mark Sturdevant --- cli/serve/app.py | 8 +++----- test/cli/test_serve.py | 41 ++++++++++++----------------------------- 2 files changed, 15 insertions(+), 34 deletions(-) diff --git a/cli/serve/app.py b/cli/serve/app.py index d948225b1..67077583f 100644 --- a/cli/serve/app.py +++ b/cli/serve/app.py @@ -61,12 +61,10 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion: total_tokens=output.usage.get("total_tokens", 0), ) - # Extract system_fingerprint (model identifier) if available + # system_fingerprint represents backend config hash, not model name + # The model name is already in response.model (line 73) + # Leave as None since we don't track backend config fingerprints yet system_fingerprint = None - if hasattr(output, "model") and output.model is not None: - system_fingerprint = output.model - elif hasattr(output, "provider") and output.provider is not None: - system_fingerprint = output.provider return ChatCompletion( id=completion_id, diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index db23d6f41..bcaaf2a5a 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -99,42 +99,25 @@ async def test_usage_field_none_when_unavailable(self, mock_module, sample_reque assert response.usage is None @pytest.mark.asyncio - async def test_system_fingerprint_from_model(self, mock_module, sample_request): - """Test that system_fingerprint is populated from model field.""" - mock_output = ModelOutputThunk("Test response") - mock_output.model = "gpt-4-turbo" - mock_module.serve.return_value = mock_output - - endpoint = make_chat_endpoint(mock_module) - response = await endpoint(sample_request) - - assert response.system_fingerprint == "gpt-4-turbo" - - @pytest.mark.asyncio - async def test_system_fingerprint_from_provider(self, mock_module, sample_request): - """Test that system_fingerprint falls back to provider field.""" - mock_output = ModelOutputThunk("Test response") - mock_output.provider = "ollama" - mock_module.serve.return_value = mock_output - - endpoint = make_chat_endpoint(mock_module) - response = await endpoint(sample_request) + async def test_system_fingerprint_always_none(self, mock_module, sample_request): + """Test that system_fingerprint is always None. - assert response.system_fingerprint == "ollama" - - @pytest.mark.asyncio - async def test_system_fingerprint_none_when_unavailable( - self, mock_module, sample_request - ): - """Test that system_fingerprint is None when not available.""" + Per OpenAI spec, system_fingerprint represents a hash of backend config, + not the model name. The model name is in response.model. + We don't currently track backend config fingerprints. + """ mock_output = ModelOutputThunk("Test response") - # Don't set model or provider + mock_output.model = "gpt-4-turbo" + mock_output.provider = "openai" mock_module.serve.return_value = mock_output endpoint = make_chat_endpoint(mock_module) response = await endpoint(sample_request) + # system_fingerprint should be None, not the model name assert response.system_fingerprint is None + # Model name should be in the model field + assert response.model == sample_request.model @pytest.mark.asyncio async def test_model_options_passed_correctly(self, mock_module, sample_request): @@ -235,6 +218,6 @@ async def test_all_fields_together(self, mock_module, sample_request): assert response.choices[0].finish_reason == "stop" assert response.usage is not None assert response.usage.total_tokens == 30 - assert response.system_fingerprint == "gpt-4" # model takes precedence + assert response.system_fingerprint is None # Not tracking backend config assert response.object == "chat.completion" assert response.id.startswith("chatcmpl-") From 43b96565b666f02e31ebb37c7a44d3cbea9796d8 Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 13:49:36 -0700 Subject: [PATCH 4/6] fix: remove unused import Signed-off-by: Mark Sturdevant --- test/cli/test_serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index bcaaf2a5a..3c76ed3dc 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -1,6 +1,6 @@ """Tests for the m serve OpenAI-compatible API server.""" -from unittest.mock import Mock, patch +from unittest.mock import Mock import pytest From cbe3eac21294ef497a459b3469f84aa22261966e Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 14:27:21 -0700 Subject: [PATCH 5/6] fix: calculate total_tokens from prompt + completion when missing When partial usage data is provided (e.g., only prompt_tokens), total_tokens was incorrectly defaulting to 0. Now it's calculated as prompt_tokens + completion_tokens when not explicitly provided. This prevents silent bad values from going out over the OpenAI-compatible API. - Modified cli/serve/app.py to calculate total_tokens when missing - Updated test_usage_with_partial_data to expect correct behavior Signed-off-by: Mark Sturdevant --- cli/serve/app.py | 12 +++++++++--- test/cli/test_serve.py | 4 +++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cli/serve/app.py b/cli/serve/app.py index 67077583f..50b4777b6 100644 --- a/cli/serve/app.py +++ b/cli/serve/app.py @@ -55,10 +55,16 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion: # Extract usage information from the ModelOutputThunk if available usage = None if hasattr(output, "usage") and output.usage is not None: + prompt_tokens = output.usage.get("prompt_tokens", 0) + completion_tokens = output.usage.get("completion_tokens", 0) + # Calculate total_tokens if not provided + total_tokens = output.usage.get( + "total_tokens", prompt_tokens + completion_tokens + ) usage = CompletionUsage( - prompt_tokens=output.usage.get("prompt_tokens", 0), - completion_tokens=output.usage.get("completion_tokens", 0), - total_tokens=output.usage.get("total_tokens", 0), + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, ) # system_fingerprint represents backend config hash, not model name diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index 3c76ed3dc..85f3bd9f1 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -196,7 +196,9 @@ async def test_usage_with_partial_data(self, mock_module, sample_request): assert response.usage is not None assert response.usage.prompt_tokens == 10 assert response.usage.completion_tokens == 0 # Should default to 0 - assert response.usage.total_tokens == 0 + assert ( + response.usage.total_tokens == 10 + ) # Should be prompt_tokens + completion_tokens @pytest.mark.asyncio async def test_all_fields_together(self, mock_module, sample_request): From 191a0c802200a36b18e2a9e6fd35c939c6e1afcd Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 14:38:12 -0700 Subject: [PATCH 6/6] test: make completion ID format test less brittle Replace hardcoded length assertion with implementation-agnostic check. The test now validates that the ID has the correct prefix and a non-empty suffix, without coupling to specific length or format details. Signed-off-by: Mark Sturdevant --- test/cli/test_serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index 85f3bd9f1..584be0876 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -151,9 +151,9 @@ async def test_completion_id_format(self, mock_module, sample_request): endpoint = make_chat_endpoint(mock_module) response = await endpoint(sample_request) - # Should start with "chatcmpl-" and be 38 characters total (chatcmpl- + 29 hex chars) + # Should start with "chatcmpl-" and have a non-empty suffix assert response.id.startswith("chatcmpl-") - assert len(response.id) == 38 + assert len(response.id) > len("chatcmpl-"), "ID should have a suffix" @pytest.mark.asyncio async def test_created_timestamp_present(self, mock_module, sample_request):