fix: redact sensitive data from evaluation SDK log messages (#45176)

slister1001 · Copilot · web-flow · commit 8a4c20821835 · 2026-02-17T16:15:07.000-05:00
* fix: redact sensitive data from log messages to resolve CredScan alert

Remove user-provided content (queries, responses, tool definitions, exception
messages) from log strings that flow into Geneva telemetry. This prevents
database connection strings and other credentials embedded in user payloads
from being flagged by CredScan.

Changes:
- Remove f-string interpolation of query/response/tool_definitions in
  warning and debug log messages
- Downgrade the noisy tool-definitions parse warning to debug level
- Sanitize upload error messages to emit only exception type name
- Chain original exception with 'from e'

Resolves ICM 738457593

* fix: call PROXY_URL() as a function instead of using it as a value

PROXY_URL in devtools_testutils.config is now a function, not a constant.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
@@ -706,7 +706,7 @@ def reformat_conversation_history(query, logger=None, include_system_messages=Fa
         #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
         #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
         if logger:
-            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
+            logger.warning("Conversation history could not be parsed, falling back to original query")
         return query
 
 
@@ -761,15 +761,15 @@ def reformat_agent_response(response, logger=None, include_tool_messages=False):
             # If no message could be extracted, likely the format changed, fallback to the original response in that case
             if logger:
                 logger.debug(
-                    f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
+                    "Empty agent response extracted, likely due to input schema change. Falling back to original response"
                 )
             return response
         return "\n".join(agent_response)
-    except:
+    except Exception:
         # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
         # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
         if logger:
-            logger.debug(f"Agent response could not be parsed, falling back to original response: {response}")
+            logger.debug("Agent response could not be parsed, falling back to original response")
         return response
 
 
@@ -787,9 +787,7 @@ def reformat_tool_definitions(tool_definitions, logger=None):
         # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
         # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
         if logger:
-            logger.warning(
-                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
-            )
+            logger.debug("Tool definitions could not be parsed, falling back to original definitions")
         return tool_definitions
 
 
@@ -915,9 +913,9 @@ def upload(path: str, container_client: ContainerClient, logger=None):
 
     except Exception as e:
         raise EvaluationException(
-            message=f"Error uploading file: {e}",
-            internal_message=f"Error uploading file: {e}",
+            message=f"Error uploading file: {type(e).__name__}",
+            internal_message=f"Error uploading file: {type(e).__name__}",
             target=ErrorTarget.RAI_CLIENT,
             category=ErrorCategory.UPLOAD_ERROR,
             blame=ErrorBlame.SYSTEM_ERROR,
-        )
+        ) from e
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
@@ -317,7 +317,7 @@ def simple_conversation():
 def redirect_openai_requests():
     """Route requests from the openai package to the test proxy."""
     config = TestProxyConfig(
-        recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL
+        recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL()
     )
 
     with TestProxyHttpxClientBase.record_with_proxy(config):

Original file line number	Diff line number	Diff line change
`@@ -317,7 +317,7 @@ def simple_conversation():`
`317`	`317`	`def redirect_openai_requests():`
`318`	`318`	`"""Route requests from the openai package to the test proxy."""`
`319`	`319`	`config = TestProxyConfig(`
`320`		`- recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL`
	`320`	`+ recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL()`
`321`	`321`	`)`
`322`	`322`
`323`	`323`	`with TestProxyHttpxClientBase.record_with_proxy(config):`