Skip to content

Commit 3be15da

Browse files
committed
feat: add tracing for PII file masking
1 parent 74784ac commit 3be15da

5 files changed

Lines changed: 499 additions & 56 deletions

File tree

src/uipath_langchain/agent/multimodal/types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,6 @@ class FileInfo:
2424
url: str
2525
name: str
2626
mime_type: str
27+
masked_attachment_url: str | None = None
28+
attachment_id: str | None = None
29+
masked_attachment_id: str | None = None

src/uipath_langchain/agent/tools/internal_tools/analyze_files_tool.py

Lines changed: 218 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import mimetypes
34
import uuid
@@ -12,14 +13,21 @@
1213
HumanMessage,
1314
SystemMessage,
1415
)
15-
from langchain_core.runnables.config import var_child_runnable_config
16+
from langchain_core.runnables.config import RunnableConfig, var_child_runnable_config
1617
from langchain_core.tools import StructuredTool
18+
from opentelemetry import trace as otel_trace
1719
from uipath.agent.models.agent import (
1820
AgentInternalToolResourceConfig,
1921
)
22+
from uipath.core.tracing.span_utils import UiPathSpanUtils
2023
from uipath.eval.mocks import mockable
2124
from uipath.platform import UiPath
2225
from uipath.runtime.errors import UiPathErrorCategory
26+
from uipath.tracing import (
27+
AttachmentDirection,
28+
AttachmentProvider,
29+
SpanAttachment,
30+
)
2331

2432
from uipath_langchain.agent.exceptions import (
2533
AgentRuntimeError,
@@ -48,6 +56,178 @@
4856
"based on the extracted information."
4957
)
5058

59+
# Key in the LangChain runnable-config ``metadata`` dict under which the
# JSON-serialized SpanAttachment list for the llmCall span is carried.
# The LLMOps callback in uipath-agents reads this entry and stamps it on the
# llmCall span as the ``attachments`` attribute.
LLM_CALL_ATTACHMENTS_METADATA_KEY = "uipath_llm_call_attachments"
63+
64+
65+
def _masked_name_for(name: str) -> str:
66+
"""Apply the ``pii_masked_`` filename prefix so the LLM sees redacted bytes under a masked name."""
67+
if "." in name:
68+
base, ext = name.rsplit(".", 1)
69+
return f"pii_masked_{base}.{ext}"
70+
return f"pii_masked_{name}"
71+
72+
73+
def _original_attachment_id(file: FileInfo) -> str:
74+
"""Return the id to use for the original file in trace attachments.
75+
76+
Prefers the orchestrator attachment UUID when present; falls back to a
77+
UUID derived from the file URL for files that did not come from
78+
orchestrator (defensive, should not happen in production paths).
79+
"""
80+
if file.attachment_id:
81+
return file.attachment_id
82+
return str(uuid.uuid5(uuid.NAMESPACE_URL, file.url))
83+
84+
85+
def _masked_attachment_id(masked_url: str) -> str:
86+
"""Derive a stable GUID from the masked URL for trace attachments.
87+
88+
The LLMOps traces endpoint validates ``Attachment.Id`` as ``System.Guid``.
89+
Masked files aren't orchestrator-tracked, so we synthesize a deterministic
90+
UUID from the redacted blob URL to satisfy the schema while keeping the id
91+
stable across re-runs.
92+
"""
93+
return str(uuid.uuid5(uuid.NAMESPACE_URL, masked_url))
94+
95+
96+
def _set_span_attachments(
97+
span: otel_trace.Span, attachments: list[SpanAttachment]
98+
) -> None:
99+
"""Write a :class:`SpanAttachment` list as a JSON string on the given OTel span."""
100+
if not attachments or span is None or not span.is_recording():
101+
return
102+
try:
103+
span.set_attribute(
104+
"attachments",
105+
json.dumps([att.model_dump(by_alias=True) for att in attachments]),
106+
)
107+
except Exception:
108+
logger.exception("Failed to emit trace attachments")
109+
110+
111+
def _llm_call_attachments_payload(files: list[FileInfo]) -> str | None:
112+
"""Build the JSON attachments payload for the llmCall span.
113+
114+
Each entry represents the file version actually sent to the model: the
115+
masked copy when PII masking ran (keyed by the orchestrator UUID from the
116+
re-upload when available, uuid5 fallback otherwise), else the original
117+
orchestrator attachment. Direction is ``IN`` because the file is an input
118+
to the LLM.
119+
"""
120+
if not files:
121+
return None
122+
attachments: list[SpanAttachment] = []
123+
for file in files:
124+
if file.masked_attachment_url:
125+
att_id = file.masked_attachment_id or _masked_attachment_id(
126+
file.masked_attachment_url
127+
)
128+
name = _masked_name_for(file.name)
129+
else:
130+
att_id = _original_attachment_id(file)
131+
name = file.name
132+
attachments.append(
133+
SpanAttachment(
134+
id=att_id,
135+
file_name=name,
136+
mime_type=file.mime_type,
137+
provider=AttachmentProvider.ORCHESTRATOR,
138+
direction=AttachmentDirection.IN,
139+
)
140+
)
141+
return json.dumps([att.model_dump(by_alias=True) for att in attachments])
142+
143+
144+
def _config_with_llm_call_attachments(
    config: RunnableConfig | None, files: list[FileInfo]
) -> RunnableConfig | None:
    """Attach the llmCall attachments payload to a runnable config.

    The payload is stored under
    ``metadata[LLM_CALL_ATTACHMENTS_METADATA_KEY]``, where the LLMOps callback
    in ``uipath-agents`` reads it and stamps it as the ``attachments``
    attribute on the llmCall span. That makes the file actually sent to the
    model (masked copy when PII masking ran, original otherwise) show up as an
    attachment on the LLM-call span.

    The incoming ``config`` and its ``metadata`` dict are shallow-copied, so
    the caller's objects are not mutated.

    Args:
        config: Current runnable config, if any.
        files: Files attached to the LLM call.

    Returns:
        ``config`` unchanged when there is no payload to add; otherwise a new
        config carrying the payload in its metadata.
    """
    serialized = _llm_call_attachments_payload(files)
    if not serialized:
        return config

    enriched = cast(RunnableConfig, dict(config) if config else {})
    merged_metadata = dict(enriched.get("metadata") or {})
    merged_metadata.update({LLM_CALL_ATTACHMENTS_METADATA_KEY: serialized})
    enriched["metadata"] = merged_metadata
    return enriched
164+
165+
166+
def _emit_pii_masking_attachments(
167+
span: otel_trace.Span, files: list[FileInfo]
168+
) -> None:
169+
"""Emit originals (IN) and masked copies (OUT) on the given PII Masking span.
170+
171+
Originals are keyed by the orchestrator attachment UUID; masked copies are
172+
keyed by the real orchestrator UUID from the re-upload when available, or
173+
a uuid5 derived from the redacted URL as a fallback.
174+
"""
175+
if not files:
176+
return
177+
attachments: list[SpanAttachment] = []
178+
input_files: list[dict[str, Any]] = []
179+
output_files: list[dict[str, Any]] = []
180+
181+
for file in files:
182+
original_id = _original_attachment_id(file)
183+
attachments.append(
184+
SpanAttachment(
185+
id=original_id,
186+
file_name=file.name,
187+
mime_type=file.mime_type,
188+
provider=AttachmentProvider.ORCHESTRATOR,
189+
direction=AttachmentDirection.IN,
190+
)
191+
)
192+
input_files.append(
193+
{"id": original_id, "fileName": file.name, "mimeType": file.mime_type}
194+
)
195+
196+
if file.masked_attachment_url:
197+
# Prefer the real orchestrator UUID from the re-upload so the UI
198+
# can download the file; fall back to the synthesized uuid5.
199+
masked_id = file.masked_attachment_id or _masked_attachment_id(
200+
file.masked_attachment_url
201+
)
202+
masked_name = _masked_name_for(file.name)
203+
attachments.append(
204+
SpanAttachment(
205+
id=masked_id,
206+
file_name=masked_name,
207+
mime_type=file.mime_type,
208+
provider=AttachmentProvider.ORCHESTRATOR,
209+
direction=AttachmentDirection.OUT,
210+
)
211+
)
212+
output_files.append(
213+
{"id": masked_id, "fileName": masked_name, "mimeType": file.mime_type}
214+
)
215+
216+
_set_span_attachments(span, attachments)
217+
218+
if span is not None and span.is_recording():
219+
try:
220+
input_payload = json.dumps({"files": input_files})
221+
output_payload = json.dumps({"files": output_files})
222+
span.set_attribute("input", input_payload)
223+
span.set_attribute("input.value", input_payload)
224+
span.set_attribute("input.mime_type", "application/json")
225+
span.set_attribute("output", output_payload)
226+
span.set_attribute("output.value", output_payload)
227+
span.set_attribute("output.mime_type", "application/json")
228+
except Exception:
229+
logger.exception("Failed to set PII Masking input/output attributes")
230+
51231

52232
def create_analyze_file_tool(
53233
resource: AgentInternalToolResourceConfig, llm: BaseChatModel
@@ -95,16 +275,30 @@ async def tool_fn(**kwargs: Any):
95275

96276
masker: PiiMasker | None = None
97277
if client is not None and PiiMasker.is_policy_enabled(policy):
98-
masker = PiiMasker(client, policy)
99-
try:
100-
analysis_task, files = await masker.apply(analysis_task, files)
101-
except Exception as exc:
102-
raise AgentRuntimeError(
103-
code=AgentRuntimeErrorCode.UNEXPECTED_ERROR,
104-
title="PII masking failed",
105-
detail=f"PII detection raised: {exc!r}",
106-
category=UiPathErrorCategory.SYSTEM,
107-
) from exc
278+
# Reconcile OTel current span with the LangChain/LangGraph external
279+
# span provider so the new span is parented under the active tool
280+
# call span and shares its trace id.
281+
parent_ctx = UiPathSpanUtils.get_parent_context()
282+
tracer = otel_trace.get_tracer(__name__)
283+
with tracer.start_as_current_span(
284+
"PII Masking", context=parent_ctx
285+
) as pii_span:
286+
# Required for the LLMOps exporter's span filter to keep this span.
287+
pii_span.set_attribute("uipath.custom_instrumentation", True)
288+
pii_span.set_attribute("span_type", "piiMasking")
289+
pii_span.set_attribute("type", "piiMasking")
290+
masker = PiiMasker(client, policy)
291+
try:
292+
analysis_task, files = await masker.apply(analysis_task, files)
293+
_emit_pii_masking_attachments(pii_span, files)
294+
except Exception as exc:
295+
pii_span.record_exception(exc)
296+
raise AgentRuntimeError(
297+
code=AgentRuntimeErrorCode.UNEXPECTED_ERROR,
298+
title="PII masking failed",
299+
detail=f"PII detection raised: {exc!r}",
300+
category=UiPathErrorCategory.SYSTEM,
301+
) from exc
108302

109303
try:
110304
human_message = HumanMessage(content=analysis_task)
@@ -122,6 +316,7 @@ async def tool_fn(**kwargs: Any):
122316
cast(AnyMessage, human_message_with_files),
123317
]
124318
config = var_child_runnable_config.get(None)
319+
config = _config_with_llm_call_attachments(config, files)
125320
result = await non_streaming_llm.ainvoke(messages, config=config)
126321

127322
del messages, human_message_with_files, files
@@ -198,6 +393,7 @@ async def _resolve_job_attachment_arguments(
198393
url=blob_info.uri,
199394
name=blob_info.name,
200395
mime_type=mime_type,
396+
attachment_id=str(attachment_id),
201397
)
202398
file_infos.append(file_info)
203399

@@ -222,7 +418,17 @@ async def add_files_to_message(
222418

223419
file_content_blocks: list[DataContentBlock] = []
224420
for file in files:
225-
blocks = await build_file_content_blocks_for(file)
421+
# Prefer the redacted URL + pii_masked_ name for LLM content when PII masking ran.
422+
llm_file = (
423+
FileInfo(
424+
url=file.masked_attachment_url,
425+
name=_masked_name_for(file.name),
426+
mime_type=file.mime_type,
427+
)
428+
if file.masked_attachment_url
429+
else file
430+
)
431+
blocks = await build_file_content_blocks_for(llm_file)
226432
file_content_blocks.extend(blocks)
227433
return append_content_blocks_to_message(
228434
message, cast(list[ContentBlock], file_content_blocks)

0 commit comments

Comments
 (0)