From 9bb9ac89c5d3c84f4e0d7dd464757575259ff04b Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Tue, 31 Mar 2026 21:00:41 -0700
Subject: [PATCH 1/6] feat(serve): improve OpenAI API compatibility with usage,
 finish_reason, and system_fingerprint

Add missing OpenAI API fields to m serve chat completion responses:

- Add `finish_reason` field to Choice model (defaults to "stop")
- Add `usage` field with token counts (prompt_tokens, completion_tokens, total_tokens)
  extracted from ModelOutputThunk.usage when available
- Add `system_fingerprint` field populated from model or provider metadata
- Fix bug in model_options extraction (use model_dump().items() instead of iterating request)

Includes comprehensive test coverage with 13 unit tests verifying all new fields
and edge cases (missing data, partial data, fallback behavior).

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 cli/serve/app.py       |  31 +++++-
 cli/serve/models.py    |  23 ++++
 test/cli/test_serve.py | 243 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 295 insertions(+), 2 deletions(-)
 create mode 100644 test/cli/test_serve.py

diff --git a/cli/serve/app.py b/cli/serve/app.py
index d126f2261..d948225b1 100644
--- a/cli/serve/app.py
+++ b/cli/serve/app.py
@@ -10,7 +10,13 @@
 import uvicorn
 from fastapi import FastAPI
 
-from .models import ChatCompletion, ChatCompletionMessage, ChatCompletionRequest, Choice
+from .models import (
+    ChatCompletion,
+    ChatCompletionMessage,
+    ChatCompletionRequest,
+    Choice,
+    CompletionUsage,
+)
 
 app = FastAPI(
     title="M serve OpenAI API Compatible Server",
@@ -40,10 +46,28 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion:
             input=request.messages,
             requirements=request.requirements,
             model_options={
-                k: v for k, v in request if k not in ["messages", "requirements"]
+                k: v
+                for k, v in request.model_dump().items()
+                if k not in ["messages", "requirements"]
             },
         )
 
+        # Extract usage information from the ModelOutputThunk if available
+        usage = None
+        if hasattr(output, "usage") and output.usage is not None:
+            usage = CompletionUsage(
+                prompt_tokens=output.usage.get("prompt_tokens", 0),
+                completion_tokens=output.usage.get("completion_tokens", 0),
+                total_tokens=output.usage.get("total_tokens", 0),
+            )
+
+        # Extract system_fingerprint (model identifier) if available
+        system_fingerprint = None
+        if hasattr(output, "model") and output.model is not None:
+            system_fingerprint = output.model
+        elif hasattr(output, "provider") and output.provider is not None:
+            system_fingerprint = output.provider
+
         return ChatCompletion(
             id=completion_id,
             model=request.model,
@@ -54,9 +78,12 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion:
                     message=ChatCompletionMessage(
                         content=output.value, role="assistant"
                     ),
+                    finish_reason="stop",
                 )
             ],
             object="chat.completion",  # type: ignore
+            system_fingerprint=system_fingerprint,
+            usage=usage,
         )  # type: ignore
 
     endpoint.__name__ = f"chat_{module.__name__}_endpoint"
diff --git a/cli/serve/models.py b/cli/serve/models.py
index 9d9a7cc51..1888e2d00 100644
--- a/cli/serve/models.py
+++ b/cli/serve/models.py
@@ -82,6 +82,23 @@ class Choice(BaseModel):
     message: ChatCompletionMessage
     """A chat completion message generated by the model."""
 
+    finish_reason: (
+        Literal["stop", "length", "content_filter", "tool_calls", "function_call"]
+        | None
+    ) = "stop"
+    """The reason the model stopped generating tokens."""
+
+
+class CompletionUsage(BaseModel):
+    completion_tokens: int
+    """Number of tokens in the generated completion."""
+
+    prompt_tokens: int
+    """Number of tokens in the prompt."""
+
+    total_tokens: int
+    """Total number of tokens used in the request (prompt + completion)."""
+
 
 class ChatCompletion(BaseModel):
     id: str
@@ -101,3 +118,9 @@ class ChatCompletion(BaseModel):
 
     object: Literal["chat.completion"]
     """The object type, which is always `chat.completion`."""
+
+    system_fingerprint: str | None = None
+    """This fingerprint represents the backend configuration that the model runs with."""
+
+    usage: CompletionUsage | None = None
+    """Usage statistics for the completion request."""
diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
new file mode 100644
index 000000000..1d658138c
--- /dev/null
+++ b/test/cli/test_serve.py
@@ -0,0 +1,243 @@
+"""Tests for the m serve OpenAI-compatible API server."""
+
+from unittest.mock import Mock, patch
+
+import pytest
+
+from cli.serve.app import make_chat_endpoint
+from cli.serve.models import (
+    ChatCompletion,
+    ChatCompletionRequest,
+    ChatMessage,
+    CompletionUsage,
+)
+from mellea.core.base import ModelOutputThunk
+
+
+@pytest.fixture
+def mock_module():
+    """Create a mock module with a serve function."""
+    module = Mock()
+    module.__name__ = "test_module"
+    return module
+
+
+@pytest.fixture
+def sample_request():
+    """Create a sample ChatCompletionRequest."""
+    return ChatCompletionRequest(
+        model="test-model",
+        messages=[ChatMessage(role="user", content="Hello, world!")],
+        temperature=0.7,
+        max_tokens=100,
+    )
+
+
+class TestChatEndpoint:
+    """Tests for the chat completion endpoint."""
+
+    @pytest.mark.asyncio
+    async def test_basic_completion(self, mock_module, sample_request):
+        """Test basic chat completion returns correct structure."""
+        # Setup mock output
+        mock_output = ModelOutputThunk("Hello! How can I help you?")
+        mock_module.serve.return_value = mock_output
+
+        # Create endpoint and call it
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        # Verify response structure
+        assert isinstance(response, ChatCompletion)
+        assert response.model == "test-model"
+        assert len(response.choices) == 1
+        assert response.choices[0].message.content == "Hello! How can I help you?"
+        assert response.choices[0].message.role == "assistant"
+        assert response.choices[0].index == 0
+
+    @pytest.mark.asyncio
+    async def test_finish_reason_included(self, mock_module, sample_request):
+        """Test that finish_reason is included in the response."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.choices[0].finish_reason == "stop"
+
+    @pytest.mark.asyncio
+    async def test_usage_field_populated(self, mock_module, sample_request):
+        """Test that usage field is populated when available."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_output.usage = {
+            "prompt_tokens": 10,
+            "completion_tokens": 5,
+            "total_tokens": 15,
+        }
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.usage is not None
+        assert isinstance(response.usage, CompletionUsage)
+        assert response.usage.prompt_tokens == 10
+        assert response.usage.completion_tokens == 5
+        assert response.usage.total_tokens == 15
+
+    @pytest.mark.asyncio
+    async def test_usage_field_none_when_unavailable(self, mock_module, sample_request):
+        """Test that usage field is None when not available."""
+        mock_output = ModelOutputThunk("Test response")
+        # Don't set usage field
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.usage is None
+
+    @pytest.mark.asyncio
+    async def test_system_fingerprint_from_model(self, mock_module, sample_request):
+        """Test that system_fingerprint is populated from model field."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_output.model = "gpt-4-turbo"
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.system_fingerprint == "gpt-4-turbo"
+
+    @pytest.mark.asyncio
+    async def test_system_fingerprint_from_provider(self, mock_module, sample_request):
+        """Test that system_fingerprint falls back to provider field."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_output.provider = "ollama"
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.system_fingerprint == "ollama"
+
+    @pytest.mark.asyncio
+    async def test_system_fingerprint_none_when_unavailable(
+        self, mock_module, sample_request
+    ):
+        """Test that system_fingerprint is None when not available."""
+        mock_output = ModelOutputThunk("Test response")
+        # Don't set model or provider
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.system_fingerprint is None
+
+    @pytest.mark.asyncio
+    async def test_model_options_passed_correctly(self, mock_module, sample_request):
+        """Test that model options are passed to serve function correctly."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        await endpoint(sample_request)
+
+        # Verify serve was called with correct arguments
+        call_args = mock_module.serve.call_args
+        assert call_args is not None
+        assert "model_options" in call_args.kwargs
+        model_options = call_args.kwargs["model_options"]
+
+        # Should include temperature and max_tokens but not messages/requirements
+        assert "temperature" in model_options
+        assert model_options["temperature"] == 0.7
+        assert "max_tokens" in model_options
+        assert model_options["max_tokens"] == 100
+        assert "messages" not in model_options
+        assert "requirements" not in model_options
+
+    @pytest.mark.asyncio
+    async def test_completion_id_format(self, mock_module, sample_request):
+        """Test that completion ID follows OpenAI format."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        # Should start with "chatcmpl-" and be 38 characters total (chatcmpl- + 29 hex chars)
+        assert response.id.startswith("chatcmpl-")
+        assert len(response.id) == 38
+
+    @pytest.mark.asyncio
+    async def test_created_timestamp_present(self, mock_module, sample_request):
+        """Test that created timestamp is present and reasonable."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        # Should be a Unix timestamp (positive integer)
+        assert isinstance(response.created, int)
+        assert response.created > 0
+
+    @pytest.mark.asyncio
+    async def test_object_type_correct(self, mock_module, sample_request):
+        """Test that object type is set correctly."""
+        mock_output = ModelOutputThunk("Test response")
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.object == "chat.completion"
+
+    @pytest.mark.asyncio
+    async def test_usage_with_partial_data(self, mock_module, sample_request):
+        """Test that usage handles missing fields gracefully."""
+        mock_output = ModelOutputThunk("Test response")
+        # Only provide some fields
+        mock_output.usage = {
+            "prompt_tokens": 10
+            # Missing completion_tokens and total_tokens
+        }
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        assert response.usage is not None
+        assert response.usage.prompt_tokens == 10
+        assert response.usage.completion_tokens == 0  # Should default to 0
+        assert response.usage.total_tokens == 0
+
+    @pytest.mark.asyncio
+    async def test_all_fields_together(self, mock_module, sample_request):
+        """Test that all new fields work together correctly."""
+        mock_output = ModelOutputThunk("Complete response")
+        mock_output.usage = {
+            "prompt_tokens": 20,
+            "completion_tokens": 10,
+            "total_tokens": 30,
+        }
+        mock_output.model = "gpt-4"
+        mock_output.provider = "openai"
+        mock_module.serve.return_value = mock_output
+
+        endpoint = make_chat_endpoint(mock_module)
+        response = await endpoint(sample_request)
+
+        # Verify all fields are present
+        assert response.choices[0].finish_reason == "stop"
+        assert response.usage is not None
+        assert response.usage.total_tokens == 30
+        assert response.system_fingerprint == "gpt-4"  # model takes precedence
+        assert response.object == "chat.completion"
+        assert response.id.startswith("chatcmpl-")
+
+
+# Made with Bob

From 97356b0f6128618a9ce6564cb9ef0a222190dfb7 Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 12:46:40 -0700
Subject: [PATCH 2/6] Remove made with Bob comment

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 test/cli/test_serve.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index 1d658138c..db23d6f41 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -238,6 +238,3 @@ async def test_all_fields_together(self, mock_module, sample_request):
         assert response.system_fingerprint == "gpt-4"  # model takes precedence
         assert response.object == "chat.completion"
         assert response.id.startswith("chatcmpl-")
-
-
-# Made with Bob

From 9649315812cb15e941a0809945e838f9d471165e Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 13:39:55 -0700
Subject: [PATCH 3/6] fix(serve): set system_fingerprint to None per OpenAI
 spec

system_fingerprint should be a backend config hash, not the model name.
The model name is already in response.model. Set to None since we don't
currently track backend config fingerprints.

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 cli/serve/app.py       |  8 +++-----
 test/cli/test_serve.py | 41 ++++++++++++-----------------------------
 2 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/cli/serve/app.py b/cli/serve/app.py
index d948225b1..67077583f 100644
--- a/cli/serve/app.py
+++ b/cli/serve/app.py
@@ -61,12 +61,10 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion:
                 total_tokens=output.usage.get("total_tokens", 0),
             )
 
-        # Extract system_fingerprint (model identifier) if available
+        # system_fingerprint represents backend config hash, not model name
+        # The model name is already in response.model (line 73)
+        # Leave as None since we don't track backend config fingerprints yet
         system_fingerprint = None
-        if hasattr(output, "model") and output.model is not None:
-            system_fingerprint = output.model
-        elif hasattr(output, "provider") and output.provider is not None:
-            system_fingerprint = output.provider
 
         return ChatCompletion(
             id=completion_id,
diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index db23d6f41..bcaaf2a5a 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -99,42 +99,25 @@ async def test_usage_field_none_when_unavailable(self, mock_module, sample_reque
         assert response.usage is None
 
     @pytest.mark.asyncio
-    async def test_system_fingerprint_from_model(self, mock_module, sample_request):
-        """Test that system_fingerprint is populated from model field."""
-        mock_output = ModelOutputThunk("Test response")
-        mock_output.model = "gpt-4-turbo"
-        mock_module.serve.return_value = mock_output
-
-        endpoint = make_chat_endpoint(mock_module)
-        response = await endpoint(sample_request)
-
-        assert response.system_fingerprint == "gpt-4-turbo"
-
-    @pytest.mark.asyncio
-    async def test_system_fingerprint_from_provider(self, mock_module, sample_request):
-        """Test that system_fingerprint falls back to provider field."""
-        mock_output = ModelOutputThunk("Test response")
-        mock_output.provider = "ollama"
-        mock_module.serve.return_value = mock_output
-
-        endpoint = make_chat_endpoint(mock_module)
-        response = await endpoint(sample_request)
+    async def test_system_fingerprint_always_none(self, mock_module, sample_request):
+        """Test that system_fingerprint is always None.
 
-        assert response.system_fingerprint == "ollama"
-
-    @pytest.mark.asyncio
-    async def test_system_fingerprint_none_when_unavailable(
-        self, mock_module, sample_request
-    ):
-        """Test that system_fingerprint is None when not available."""
+        Per OpenAI spec, system_fingerprint represents a hash of backend config,
+        not the model name. The model name is in response.model.
+        We don't currently track backend config fingerprints.
+        """
         mock_output = ModelOutputThunk("Test response")
-        # Don't set model or provider
+        mock_output.model = "gpt-4-turbo"
+        mock_output.provider = "openai"
         mock_module.serve.return_value = mock_output
 
         endpoint = make_chat_endpoint(mock_module)
         response = await endpoint(sample_request)
 
+        # system_fingerprint should be None, not the model name
         assert response.system_fingerprint is None
+        # Model name should be in the model field
+        assert response.model == sample_request.model
 
     @pytest.mark.asyncio
     async def test_model_options_passed_correctly(self, mock_module, sample_request):
@@ -235,6 +218,6 @@ async def test_all_fields_together(self, mock_module, sample_request):
         assert response.choices[0].finish_reason == "stop"
         assert response.usage is not None
         assert response.usage.total_tokens == 30
-        assert response.system_fingerprint == "gpt-4"  # model takes precedence
+        assert response.system_fingerprint is None  # Not tracking backend config
         assert response.object == "chat.completion"
         assert response.id.startswith("chatcmpl-")

From 43b96565b666f02e31ebb37c7a44d3cbea9796d8 Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 13:49:36 -0700
Subject: [PATCH 4/6] fix: remove unused import

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 test/cli/test_serve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index bcaaf2a5a..3c76ed3dc 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -1,6 +1,6 @@
 """Tests for the m serve OpenAI-compatible API server."""
 
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
 
 import pytest
 

From cbe3eac21294ef497a459b3469f84aa22261966e Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 14:27:21 -0700
Subject: [PATCH 5/6] fix: calculate total_tokens from prompt + completion when
 missing

When partial usage data is provided (e.g., only prompt_tokens),
total_tokens was incorrectly defaulting to 0. Now it's calculated
as prompt_tokens + completion_tokens when not explicitly provided.

This prevents silent bad values from going out over the OpenAI-compatible API.

- Modified cli/serve/app.py to calculate total_tokens when missing
- Updated test_usage_with_partial_data to expect correct behavior

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 cli/serve/app.py       | 12 +++++++++---
 test/cli/test_serve.py |  4 +++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/cli/serve/app.py b/cli/serve/app.py
index 67077583f..50b4777b6 100644
--- a/cli/serve/app.py
+++ b/cli/serve/app.py
@@ -55,10 +55,16 @@ async def endpoint(request: ChatCompletionRequest) -> ChatCompletion:
         # Extract usage information from the ModelOutputThunk if available
         usage = None
         if hasattr(output, "usage") and output.usage is not None:
+            prompt_tokens = output.usage.get("prompt_tokens", 0)
+            completion_tokens = output.usage.get("completion_tokens", 0)
+            # Calculate total_tokens if not provided
+            total_tokens = output.usage.get(
+                "total_tokens", prompt_tokens + completion_tokens
+            )
             usage = CompletionUsage(
-                prompt_tokens=output.usage.get("prompt_tokens", 0),
-                completion_tokens=output.usage.get("completion_tokens", 0),
-                total_tokens=output.usage.get("total_tokens", 0),
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
             )
 
         # system_fingerprint represents backend config hash, not model name
diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index 3c76ed3dc..85f3bd9f1 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -196,7 +196,9 @@ async def test_usage_with_partial_data(self, mock_module, sample_request):
         assert response.usage is not None
         assert response.usage.prompt_tokens == 10
         assert response.usage.completion_tokens == 0  # Should default to 0
-        assert response.usage.total_tokens == 0
+        assert (
+            response.usage.total_tokens == 10
+        )  # Should be prompt_tokens + completion_tokens
 
     @pytest.mark.asyncio
     async def test_all_fields_together(self, mock_module, sample_request):

From 191a0c802200a36b18e2a9e6fd35c939c6e1afcd Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 14:38:12 -0700
Subject: [PATCH 6/6] test: make completion ID format test less brittle

Replace hardcoded length assertion with implementation-agnostic check.
The test now validates that the ID has the correct prefix and a non-empty
suffix, without coupling to specific length or format details.

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 test/cli/test_serve.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index 85f3bd9f1..584be0876 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -151,9 +151,9 @@ async def test_completion_id_format(self, mock_module, sample_request):
         endpoint = make_chat_endpoint(mock_module)
         response = await endpoint(sample_request)
 
-        # Should start with "chatcmpl-" and be 38 characters total (chatcmpl- + 29 hex chars)
+        # Should start with "chatcmpl-" and have a non-empty suffix
         assert response.id.startswith("chatcmpl-")
-        assert len(response.id) == 38
+        assert len(response.id) > len("chatcmpl-"), "ID should have a suffix"
 
     @pytest.mark.asyncio
     async def test_created_timestamp_present(self, mock_module, sample_request):