fix: use alias-free input schema in ixp extraction tool

cristian-groza · cristian-groza · commit 91157606c2da · 2026-05-04T22:30:53.000+03:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath-langchain"
-version = "0.10.11"
+version = "0.10.12"
 description = "Python SDK that enables developers to build and deploy LangGraph agents to the UiPath Cloud Platform"
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/src/uipath_langchain/agent/tools/extraction_tool.py b/src/uipath_langchain/agent/tools/extraction_tool.py
@@ -1,14 +1,15 @@
 """Ixp extraction tool."""
 
-from typing import Any
+import uuid
+from typing import Any, Optional
 
 from langchain.tools import BaseTool
 from langchain_core.messages import ToolCall, ToolMessage
 from langchain_core.tools import StructuredTool
 from langgraph.types import Command, interrupt
+from pydantic import BaseModel, Field
 from uipath.agent.models.agent import AgentIxpExtractionResourceConfig
 from uipath.eval.mocks import mockable
-from uipath.platform.attachments import Attachment
 from uipath.platform.common import DocumentExtraction
 from uipath.platform.documents import ExtractionResponseIXP
 
@@ -26,6 +27,34 @@ class StructuredToolWithWrapper(StructuredToolWithOutputType, ToolWrapperMixin):
     pass
 
 
+class ExtractionToolInputSchema(BaseModel):
+    """Alias-free mirror of `Attachment` used as the tool's args_schema.
+
+    We don't use `Attachment` directly because its fields carry aliases
+    (`id` -> `ID`, `full_name` -> `FullName`, ...) and LangChain mishandles
+    aliased fields in two places (see PR #796):
+
+    1. `BaseTool._parse_input()` extracts each field with `getattr(model, key)`,
+       where `key` is the alias. For aliases that collide with built-in model
+       attributes (e.g. `schema`), this returns the built-in instead of the
+       field value, so downstream `kwargs.get("id")` and
+       `kwargs.get("full_name")` came back as `None`.
+    2. `tool_call_schema` rebuilds a subset of the model by copying each field
+       but drops alias and serialization options, so the rebuilt schema no
+       longer matches what the LLM emits.
+
+    Until LangChain fixes both, exposing an alias-free schema with field
+    names matching `Attachment`'s python names sidesteps the issue. Keep the
+    fields here in sync with `Attachment` — the test
+    `test_extraction_tool_has_attachment_input_schema` enforces this.
+    """
+
+    id: uuid.UUID
+    full_name: str
+    mime_type: str
+    metadata: Optional[dict[str, Any]] = Field(None)
+
+
 def create_ixp_extraction_tool(
     resource: AgentIxpExtractionResourceConfig,
 ) -> StructuredTool:
@@ -38,27 +67,21 @@ def create_ixp_extraction_tool(
     @mockable(
         name=resource.name,
         description=resource.description,
-        input_schema=Attachment.model_json_schema(),
+        input_schema=ExtractionToolInputSchema.model_json_schema(),
         output_schema=ExtractionResponseIXP.model_json_schema(),
         example_calls=resource.properties.example_calls,
     )
     async def extraction_tool_fn(**kwargs: Any) -> ExtractionResponseIXP:
         from uipath.platform import UiPath
 
+        attachment = ExtractionToolInputSchema.model_validate(kwargs)
         uipath = UiPath()
 
-        attachment_id = kwargs.get("id")
-        attachment_full_name = kwargs.get("full_name")
-
-        # TODO: attachment_mime_type is currently not used anywhere (attachment_full_name will also be obsolete once attachments api is onboarded)
-        # should we use them somewhere else? otherwise input_schema should only contain the file id
-        # attachment_mime_type = kwargs.get("mime_type")
-
         # TODO: current workaround. DocumentExtraction model should support attachment_id and use the
         # start_ixp_extraction_from_attachment sdk method once support is added
 
         attachment_local_file_path = await uipath.attachments.download_async(
-            key=attachment_id, destination_path=attachment_full_name
+            key=attachment.id, destination_path=attachment.full_name
         )
         document_extraction_response = interrupt(
             DocumentExtraction(
@@ -95,7 +118,7 @@ async def extraction_tool_wrapper(
     tool = StructuredToolWithWrapper(
         name=tool_name,
         description=resource.description,
-        args_schema=Attachment,
+        args_schema=ExtractionToolInputSchema,
         coroutine=extraction_tool_fn,
         output_type=ExtractionResponseIXP,
         metadata={
diff --git a/tests/agent/tools/test_extraction_tool.py b/tests/agent/tools/test_extraction_tool.py
@@ -11,7 +11,10 @@
 from uipath.platform.attachments import Attachment
 from uipath.platform.documents import ExtractionResponseIXP
 
-from uipath_langchain.agent.tools.extraction_tool import create_ixp_extraction_tool
+from uipath_langchain.agent.tools.extraction_tool import (
+    ExtractionToolInputSchema,
+    create_ixp_extraction_tool,
+)
 
 
 class TestExtractionToolMetadata:
@@ -76,10 +79,15 @@ def test_extraction_tool_has_correct_description(self, extraction_resource):
         assert tool.description == "Extract data from files"
 
     def test_extraction_tool_has_attachment_input_schema(self, extraction_resource):
-        """Test that extraction tool uses Attachment as input schema."""
+        """Test that extraction tool's input schema mirrors Attachment fields."""
         tool = create_ixp_extraction_tool(extraction_resource)
 
-        assert tool.args_schema == Attachment
+        schema_fields = tool.args_schema.model_fields
+        attachment_fields = Attachment.model_fields
+
+        assert schema_fields.keys() == attachment_fields.keys()
+        for name, attachment_field in attachment_fields.items():
+            assert schema_fields[name].annotation == attachment_field.annotation
 
     def test_extraction_tool_has_extraction_response_output_type(
         self, extraction_resource
@@ -235,6 +243,39 @@ async def test_extraction_tool_propagates_download_exception(
 
         assert "Download failed" in str(exc_info.value)
 
+    @pytest.mark.asyncio
+    @patch("uipath.platform.UiPath")
+    @patch("uipath_langchain.agent.tools.extraction_tool.interrupt")
+    async def test_extraction_tool_handles_alias_keyed_input(
+        self, mock_interrupt, mock_uipath_class, extraction_resource
+    ):
+        """Despite its name, this test feeds the tool python-name keys
+        (id/full_name/mime_type) from ExtractionToolInputSchema.model_dump(), not
+        aliases. download_async must be called with the populated UUID, not key=None.
+        """
+        mock_client = MagicMock()
+        mock_uipath_class.return_value = mock_client
+        mock_client.attachments.download_async = AsyncMock(
+            return_value="/path/to/document.pdf"
+        )
+        mock_interrupt.return_value = {"extracted_data": {"field1": "value1"}}
+
+        tool = create_ixp_extraction_tool(extraction_resource)
+
+        attachment = ExtractionToolInputSchema(
+            id=UUID("fa93f4ca-bd3f-473a-93e5-e6e5b5a8f27f"),
+            full_name="document.pdf",
+            mime_type="application/pdf",
+        )
+        aliased_input = attachment.model_dump()
+
+        await tool.ainvoke(aliased_input)
+
+        mock_client.attachments.download_async.assert_called_once_with(
+            key=UUID("fa93f4ca-bd3f-473a-93e5-e6e5b5a8f27f"),
+            destination_path="document.pdf",
+        )
+
 
 class TestExtractionToolNameSanitization:
     """Test that extraction tool names are properly sanitized."""
diff --git a/uv.lock b/uv.lock