fix: Improved LangChain serialization (NVIDIA#165)

dagardner-nv · web-flow · commit a3c552bb9adf · 2026-05-28T18:33:45.000Z
#### Overview Fix LangChain serialization by adding a LangChain-specific codec - [x] I confirm this contribution is my own work, or I have the right to submit it under this project's license. - [x] I searched existing issues and open pull requests, and this does not duplicate existing work. #### Details * This fixes the ability to use LLM intercepts * First pass at documenting the immutability of `LLMRequest` and the immutability of `AnnotatedLLMRequest` (I need to go back and update this for the other language bindings) #### Where should the reviewer start? `python/nemo_relay/integrations/langchain/_serialization.py` #### Related Issues: (use one of the action keywords Closes / Fixes / Resolves / Relates to) - Closes # ## Summary by CodeRabbit * **Documentation** * Clarified that LLM request objects are immutable; examples now show returning new request instances instead of mutating originals. * **Integrations** * LangChain integration reworked to use a dedicated codec for reliable request/response translation, role normalization, tool-call handling, and preservation of extra fields. * **Tests** * Expanded LangChain and middleware tests, including codec round-trip and interceptor behavior; added an end-to-end agent integration test. [![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/NeMo-Relay/pull/165?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack) Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Will Killian (https://github.com/willkill07) URL: NVIDIA#165
diff --git a/docs/build-plugins/code-examples.mdx b/docs/build-plugins/code-examples.mdx
@@ -13,6 +13,8 @@ This page collects concrete examples for the surrounding guide area.
 
 Use an LLM request intercept when a plugin needs to inject tenant or routing metadata into every provider request.
 
+LLM request intercepts receive three arguments: `name`, `request`, and `annotated`. The `request` object is immutable; to apply edits, return a new instance of the request that carries them. Intercepts written in Rust are the exception to this.
+
 <Tabs>
 <Tab title="Python" language="python">
 ```python
@@ -30,8 +32,9 @@ class HeaderPlugin:
 
     def register(self, plugin_config, context):
         def add_header(name, request, annotated):
-            request.headers[plugin_config["header_name"]] = plugin_config["value"]
-            return request, annotated
+            headers = request.headers.copy()
+            headers[plugin_config["header_name"]] = plugin_config["value"]
+            return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
 
         context.register_llm_request_intercept("inject-header", 100, False, add_header)
 
diff --git a/docs/build-plugins/register-behavior.mdx b/docs/build-plugins/register-behavior.mdx
@@ -122,8 +122,9 @@ class HeaderPlugin:
 
     def register(self, plugin_config, context):
         def add_header(name, request, annotated):
-            request.headers[plugin_config["header_name"]] = plugin_config["value"]
-            return request, annotated
+            headers = request.headers.copy()
+            headers[plugin_config["header_name"]] = plugin_config["value"]
+            return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
 
         context.register_llm_request_intercept("inject-header", 100, False, add_header)
 
diff --git a/docs/integrate-into-frameworks/provider-codecs.mdx b/docs/integrate-into-frameworks/provider-codecs.mdx
@@ -89,6 +89,9 @@ def add_system_message(_name, request, annotated):
     if annotated is None:
         return request, annotated
 
+    # Attributes of the annotated request can be re-assigned, but cannot be modified in-place.
+    # For example `annotated.messages.append(...)` would not work, but re-assigning
+    # `annotated.messages = annotated.messages + [...]` does work.
     annotated.messages = [
         {"role": "system", "content": "Answer with concise technical detail."},
         *annotated.messages,
diff --git a/python/nemo_relay/__init__.py b/python/nemo_relay/__init__.py
@@ -38,8 +38,9 @@ def redact_args(tool_name, args):
         return {**args, "api_key": "***"}
 
     def add_header(name, request, annotated):
-        request.headers["Authorization"] = "Bearer test-token"
-        return request, annotated
+        headers = request.headers.copy()
+        headers["Authorization"] = "Bearer test-token"
+        return nemo_relay.LLMRequest(headers=headers, content=request.content), annotated
 
     async def tool_impl(args):
         return {"echo": args["query"]}
diff --git a/python/nemo_relay/integrations/langchain/_serialization.py b/python/nemo_relay/integrations/langchain/_serialization.py
@@ -5,51 +5,33 @@
 
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING, Any
 
 from langchain.agents.middleware import ModelResponse
 from langchain_core.messages import (
+    AIMessage,
     BaseMessage,
+    HumanMessage,
     SystemMessage,
     ToolMessage,
     messages_from_dict,
     messages_to_dict,
 )
 from langgraph.types import Command, Send
 
-from nemo_relay.codecs import AnthropicMessagesCodec, LlmCodec, OpenAIChatCodec, OpenAIResponsesCodec
+from nemo_relay import AnnotatedLLMRequest, LLMRequest
+from nemo_relay.codecs import LlmCodec
 
 if TYPE_CHECKING:
     from langchain.agents.middleware import ModelRequest
 
-
-# In order to infer codec support from LangChain chat model types, we need to import them here.
-# However these may not be installed in the user's environment.
-_HAS_ANTHROPIC = False
-_HAS_OPENAI = False
-_HAS_NVIDIA = False
-try:
-    from langchain_anthropic import ChatAnthropic
-
-    _HAS_ANTHROPIC = True
-except ImportError:
-    pass
-
-try:
-    from langchain_openai import ChatOpenAI
-
-    _HAS_OPENAI = True
-except ImportError:
-    pass
-
-try:
-    from langchain_nvidia_ai_endpoints import ChatNVIDIA
-
-    _HAS_NVIDIA = True
-except ImportError:
-    pass
-
 LANGCHAIN_MODEL_RESPONSE_KEY = "__nemo_relay_integrations_langchain_model_response"
+_LANGCHAIN_MODELED_REQUEST_KEYS = {"messages", "model", "tool_choice", "tools"}
+_LC_TO_RELAY_MESSAGE_ROLE = {
+    "human": "user",
+    "ai": "assistant",
+}
 
 
 def get_model_name(model: Any) -> str | None:
@@ -61,24 +43,156 @@ def get_model_name(model: Any) -> str | None:
     return None
 
 
-def infer_codec_from_model(model: Any) -> LlmCodec | None:
-    """Infer a NeMo Relay codec name from a LangChain chat model."""
-    if _HAS_ANTHROPIC:
-        if isinstance(model, ChatAnthropic):
-            return AnthropicMessagesCodec()
-
-    if _HAS_NVIDIA:
-        if isinstance(model, ChatNVIDIA):
-            return OpenAIChatCodec()
-
-    if _HAS_OPENAI:
-        if isinstance(model, ChatOpenAI):
-            if getattr(model, "use_responses_api", None) is True:
-                return OpenAIResponsesCodec()
-
-            return OpenAIChatCodec()
-
-    return None
+class LangChainCodec(LlmCodec):
+    """Translate LangChain ``ModelRequest`` payloads for request intercepts."""
+
+    @classmethod
+    def _langchain_tool_calls_to_annotated(cls, tool_calls: list[Any]) -> list[dict[str, Any]]:
+        annotated_tool_calls = []
+        for tool_call in tool_calls:
+            args = tool_call["args"]
+            arguments = args if isinstance(args, str) else json.dumps(args)
+            annotated_tool_calls.append(
+                {
+                    "id": tool_call.get("id") or "",
+                    "type": "function",
+                    "function": {
+                        "name": tool_call["name"],
+                        "arguments": arguments,
+                    },
+                }
+            )
+
+        return annotated_tool_calls
+
+    @classmethod
+    def _annotated_tool_calls_to_langchain(cls, tool_calls: Any) -> list[dict[str, Any]] | None:
+        if not isinstance(tool_calls, list) or not tool_calls:
+            return None
+
+        langchain_tool_calls = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                continue
+            function = tool_call.get("function")
+            if isinstance(function, dict):
+                name = str(function.get("name") or "")
+                arguments = function.get("arguments", {})
+            else:
+                name = str(tool_call.get("name") or "")
+                arguments = tool_call.get("args", {})
+
+            if isinstance(arguments, str):
+                try:
+                    args = json.loads(arguments)
+                except json.JSONDecodeError:
+                    args = {"arguments": arguments}
+            elif isinstance(arguments, dict):
+                args = arguments
+            else:
+                args = {}
+
+            langchain_tool_calls.append(
+                {
+                    "name": name,
+                    "args": args,
+                    "id": str(tool_call.get("id") or ""),
+                    "type": "tool_call",
+                }
+            )
+
+        return langchain_tool_calls or None
+
+    @classmethod
+    def _langchain_message_to_annotated(cls, message: BaseMessage) -> list[dict[str, Any]]:
+        content = message.content
+        if content is None:
+            content = []
+        elif isinstance(content, str):
+            content = [content]
+
+        name = message.name
+        role = _LC_TO_RELAY_MESSAGE_ROLE.get(message.type, message.type)
+
+        messages = []
+        for msg in content:
+            relay_message: dict[str, Any] = {"role": role}
+            if isinstance(msg, str):
+                relay_message["content"] = msg
+            elif isinstance(msg, dict):
+                relay_message.update(msg)
+                if "content" not in relay_message:
+                    relay_message["content"] = relay_message.pop("text", "")
+            else:
+                raise ValueError(f"Unsupported LangChain message content type: {type(content)}")
+
+            if name is not None:
+                relay_message["name"] = name
+
+            # Using getattr as we are inferring subclasses of BaseMessage based upon the role
+            if role == "assistant":
+                tool_calls = getattr(message, "tool_calls", [])
+                relay_message["tool_calls"] = cls._langchain_tool_calls_to_annotated(tool_calls)
+            elif role == "tool":
+                relay_message["tool_call_id"] = getattr(message, "tool_call_id", "")
+
+            messages.append(relay_message)
+
+        return messages
+
+    @classmethod
+    def _annotated_message_to_langchain(cls, message: dict[str, Any]) -> BaseMessage:
+        role = message.get("role")
+        content = message.get("content", "")
+        name = message.get("name")
+
+        if role == "system":
+            return SystemMessage(content=content, name=name)
+        if role == "user":
+            return HumanMessage(content=content, name=name)
+        if role == "assistant":
+            tool_calls = cls._annotated_tool_calls_to_langchain(message.get("tool_calls"))
+            return AIMessage(content=content, name=name, tool_calls=tool_calls or [])
+        if role == "tool":
+            return ToolMessage(content=content, name=name, tool_call_id=str(message.get("tool_call_id") or ""))
+        raise ValueError(f"Unsupported annotated LangChain message role: {role!r}")
+
+    def decode(self, request: LLMRequest) -> AnnotatedLLMRequest:
+        """Decode a LangChain-shaped request payload into an annotated request."""
+        payload = request.content
+        raw_messages = payload.get("messages", [])
+        messages: list[dict[str, Any]] = []
+        if isinstance(raw_messages, list):
+            for message in messages_from_dict(raw_messages):
+                messages.extend(self._langchain_message_to_annotated(message))
+
+        model = payload.get("model")
+        tools = payload.get("tools")
+        tool_choice = payload.get("tool_choice")
+        extra = {key: value for key, value in payload.items() if key not in _LANGCHAIN_MODELED_REQUEST_KEYS}
+
+        return AnnotatedLLMRequest(
+            messages,
+            model=model if isinstance(model, str) else None,
+            tools=tools if isinstance(tools, list) else None,
+            tool_choice=tool_choice if isinstance(tool_choice, str | dict) else None,
+            extra=extra or None,
+        )
+
+    def encode(self, annotated: AnnotatedLLMRequest, original: LLMRequest) -> LLMRequest:
+        """Encode annotated request edits back into a LangChain-shaped payload."""
+        payload = dict(original.content)
+        payload.update(annotated.extra)
+        payload["messages"] = messages_to_dict(
+            [self._annotated_message_to_langchain(message) for message in annotated.messages]
+        )
+        if annotated.model is not None:
+            payload["model"] = annotated.model
+        if annotated.tools is not None:
+            payload["tools"] = annotated.tools
+        if annotated.tool_choice is not None:
+            payload["tool_choice"] = annotated.tool_choice
+        return LLMRequest(dict(original.headers), payload)
 
 
 def split_system_message(messages: list[BaseMessage]) -> tuple[SystemMessage | None, list[BaseMessage]]:
@@ -109,12 +223,12 @@ def model_request_to_payload(model_name: str | None, request: ModelRequest[Any])
 
 def payload_to_model_request(
     original: ModelRequest[Any],
-    payload: dict[str, Any],
+    llm_request: LLMRequest,
 ) -> ModelRequest[Any]:
     """Apply supported NeMo Relay request-intercept edits back to ``ModelRequest``."""
     overrides: dict[str, Any] = {}
 
-    raw_messages = payload.get("messages")
+    raw_messages = llm_request.content.get("messages")
     if isinstance(raw_messages, list) and len(raw_messages) > 0:
         try:
             system_message, messages = split_system_message(messages_from_dict(raw_messages))
@@ -123,12 +237,24 @@ def payload_to_model_request(
         except Exception:
             pass
 
-    model_settings = payload.get("model_settings")
+    model_settings = llm_request.content.get("model_settings")
     if isinstance(model_settings, dict):
-        overrides["model_settings"] = model_settings
+        # Using dict() to ensure we have a copy
+        model_settings_copy = dict(model_settings)
+        extra_headers = model_settings_copy.get("extra_headers")
+        if not isinstance(extra_headers, dict):
+            extra_headers = {}
+        overrides["model_settings"] = model_settings_copy
+    else:
+        overrides["model_settings"] = {}
+        extra_headers = {}
+
+    if len(llm_request.headers) > 0:
+        extra_headers.update(llm_request.headers)
+        overrides["model_settings"]["extra_headers"] = extra_headers
 
-    if "tool_choice" in payload:
-        overrides["tool_choice"] = payload["tool_choice"]
+    if "tool_choice" in llm_request.content:
+        overrides["tool_choice"] = llm_request.content["tool_choice"]
 
     return original.override(**overrides) if overrides else original
 
diff --git a/python/nemo_relay/integrations/langchain/middleware.py b/python/nemo_relay/integrations/langchain/middleware.py
@@ -12,8 +12,8 @@
 
 import nemo_relay
 from nemo_relay.integrations.langchain._serialization import (
+    LangChainCodec,
     get_model_name,
-    infer_codec_from_model,
     model_request_to_payload,
     model_response_from_json,
     model_response_to_json,
@@ -72,7 +72,7 @@ def _prepare_model_call(self, request: ModelRequest[Any]) -> tuple:
         object_codec = nemo_relay.typed.BestEffortAnyCodec()
         model_name = get_model_name(request.model)
         llm_request = nemo_relay.LLMRequest({}, model_request_to_payload(model_name, request))
-        model_codec = infer_codec_from_model(request.model)
+        model_codec = LangChainCodec()
         return (object_codec, llm_request, model_name, model_codec)
 
     def wrap_model_call(
@@ -83,8 +83,8 @@ def wrap_model_call(
         """Wrap a sync LangChain agent model call in NeMo Relay LLM execution."""
         (object_codec, llm_request, model_name, model_codec) = self._prepare_model_call(request)
 
-        async def _call(req: Any) -> Any:
-            response = handler(payload_to_model_request(request, req.content))
+        async def _call(req: nemo_relay.LLMRequest) -> Any:
+            response = handler(payload_to_model_request(request, req))
             return model_response_to_json(response, object_codec)
 
         result = run_sync(
@@ -106,8 +106,8 @@ async def awrap_model_call(
         """Wrap an async LangChain agent model call in NeMo Relay LLM execution."""
         (object_codec, llm_request, model_name, model_codec) = self._prepare_model_call(request)
 
-        async def _call(req: Any) -> Any:
-            response = await handler(payload_to_model_request(request, req.content))
+        async def _call(req: nemo_relay.LLMRequest) -> Any:
+            response = await handler(payload_to_model_request(request, req))
             return model_response_to_json(response, object_codec)
 
         result = await self._llm_execute(
diff --git a/python/nemo_relay/intercepts.py b/python/nemo_relay/intercepts.py
diff --git a/python/tests/integrations/deepagents_tests/test_deepagents_integration.py b/python/tests/integrations/deepagents_tests/test_deepagents_integration.py
diff --git a/python/tests/integrations/langchain_tests/test_middleware.py b/python/tests/integrations/langchain_tests/test_middleware.py