Skip to content

Commit f5c3592

Browse files
authored
Add missing judge_tools.py (#347)
* missed a file in intial mlflow judge evaluation. updated README.md to be a bit more clear * rate limit workaround adjustments * linting fixes * linting fixes
1 parent 1d74320 commit f5c3592

File tree

7 files changed

+383
-24
lines changed

7 files changed

+383
-24
lines changed

.test/README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,14 @@ Agent evaluation runs a real Claude Code instance via the [Claude Agent SDK](htt
6767
}
6868
```
6969

70-
| Field | Purpose |
71-
|-------|---------|
72-
| `ANTHROPIC_MODEL` | Default model the agent uses |
73-
| `ANTHROPIC_BASE_URL` | Claude API endpoint (Databricks AI Gateway or direct) |
74-
| `ANTHROPIC_AUTH_TOKEN` | Auth token — supports `${VAR:-default}` interpolation |
75-
| `ANTHROPIC_CUSTOM_HEADERS` | Extra headers (e.g., coding agent mode for Databricks) |
76-
| `DATABRICKS_CONFIG_PROFILE` | Databricks CLI profile for MCP tools |
77-
| `DATABRICKS_API_KEY` | Databricks token for MCP tool calls |
70+
| Field | Purpose |
71+
|-------|-------------------------------------------------------------------------|
72+
| `ANTHROPIC_MODEL` | Default model the agent uses. Currently points to Databricks by default |
73+
| `ANTHROPIC_BASE_URL` | Claude API endpoint (Databricks AI Gateway or direct) |
74+
| `ANTHROPIC_AUTH_TOKEN` | Auth token — supports `${VAR:-default}` interpolation |
75+
| `ANTHROPIC_CUSTOM_HEADERS` | Extra headers (e.g., coding agent mode for Databricks) |
76+
| `DATABRICKS_CONFIG_PROFILE` | Databricks CLI profile for MCP tools |
77+
| `DATABRICKS_API_KEY` | Databricks token for MCP tool calls |
7878

7979
The `${VAR:-default}` syntax lets you reference environment variables with fallbacks. The agent runs with `bypassPermissions` mode so it doesn't prompt for tool approval.
8080

.test/src/skill_test/agent/executor.py

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,25 @@
3333
_mlflow_env_lock = threading.Lock()
3434
_mlflow_env_configured = False
3535

36+
# Serialize process_transcript calls across parallel agents to avoid
37+
# burst HTTP load on the MLflow tracking server when multiple agents
38+
# finish concurrently (e.g. --parallel-agents 3).
39+
# Lazy per-loop factory: asyncio.Semaphore binds to the running loop at
40+
# creation time. When _run_in_fresh_loop creates a new loop the module-level
41+
# semaphore would crash with "attached to a different loop". Instead we
42+
# cache one semaphore per event-loop id.
43+
_transcript_semaphores: dict[int, asyncio.Semaphore] = {}
44+
_transcript_semaphore_lock = threading.Lock()
45+
46+
47+
def _get_transcript_semaphore() -> asyncio.Semaphore:
48+
"""Return a Semaphore(1) bound to the current running event loop."""
49+
loop_id = id(asyncio.get_running_loop())
50+
with _transcript_semaphore_lock:
51+
if loop_id not in _transcript_semaphores:
52+
_transcript_semaphores[loop_id] = asyncio.Semaphore(1)
53+
return _transcript_semaphores[loop_id]
54+
3655

3756
@dataclass
3857
class AgentEvent:
@@ -391,20 +410,34 @@ async def mlflow_stop_hook(input_data, tool_use_id, context):
391410
# Run process_transcript synchronously — it does HTTP I/O per span
392411
# so can take 20-40s for large sessions. Use a generous timeout to
393412
# prevent hangs from rate limits or network issues.
413+
# Serialize across parallel agents to avoid burst HTTP load on the
414+
# MLflow tracking server when multiple agents finish concurrently.
394415
loop = asyncio.get_running_loop()
395-
try:
396-
trace = await asyncio.wait_for(
397-
loop.run_in_executor(None, process_transcript, transcript_path, session_id),
398-
timeout=120.0,
399-
)
400-
except asyncio.TimeoutError:
401-
print(
402-
f" [MLflow] ERROR: process_transcript timed out after 120s "
403-
f"(session={session_id}). This may indicate rate limiting or "
404-
f"network issues. Continuing without trace."
405-
)
406-
result_holder["trace"] = None
407-
return {"continue": True}
416+
max_retries = 3
417+
trace = None
418+
async with _get_transcript_semaphore():
419+
for attempt in range(max_retries):
420+
try:
421+
trace = await asyncio.wait_for(
422+
loop.run_in_executor(None, process_transcript, transcript_path, session_id),
423+
timeout=300.0,
424+
)
425+
break
426+
except asyncio.TimeoutError:
427+
if attempt < max_retries - 1:
428+
wait = 2 ** (attempt + 1) # 2s, 4s
429+
print(
430+
f" [MLflow] process_transcript attempt {attempt + 1} timed out, "
431+
f"retrying in {wait}s..."
432+
)
433+
await asyncio.sleep(wait)
434+
else:
435+
print(
436+
f" [MLflow] ERROR: process_transcript timed out after {max_retries} "
437+
f"attempts (session={session_id}). Continuing without trace."
438+
)
439+
result_holder["trace"] = None
440+
return {"continue": True}
408441

409442
result_holder["trace"] = trace
410443

.test/src/skill_test/optimize/agent_evaluator.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ def __init__(
157157
self._mlflow_experiment = mlflow_experiment
158158
self._skill_name = skill_name
159159

160+
# Cache WITH-skill evaluation results keyed on (prompt_hash, candidate_hash)
161+
self._with_skill_cache: dict[str, tuple[float, dict]] = {}
162+
160163
# Caches for WITHOUT-skill runs (keyed by prompt hash)
161164
self._baseline_response_cache: dict[str, str] = {}
162165
self._baseline_trace_cache: dict[str, dict] = {}
@@ -258,6 +261,12 @@ def _evaluate(
258261
skill_md = candidate.get("skill_md", "")
259262
prompt = example.get("input", "")
260263

264+
# Check candidate-level cache
265+
candidate_hash = hashlib.sha256(json.dumps(candidate, sort_keys=True).encode()).hexdigest()[:16]
266+
cache_key = f"{_prompt_hash(prompt)}:{candidate_hash}"
267+
if cache_key in self._with_skill_cache:
268+
return self._with_skill_cache[cache_key]
269+
261270
# Decode expectations
262271
expectations: dict[str, Any] = {}
263272
expectations_json = example.get("additional_context", {}).get("expectations", "")
@@ -629,6 +638,9 @@ def _judge_with_fallback(
629638
f"guideline_adherence={guideline_adherence_score:.2f}"
630639
)
631640

641+
# Store in candidate-level cache
642+
self._with_skill_cache[cache_key] = (final_score, side_info)
643+
632644
return final_score, side_info
633645

634646

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""Custom judge tools for adaptive evaluation criteria loading.
2+
3+
Implements ``ReadSkillTool`` and ``ReadSkillReferenceTool`` from the MLflow
4+
#21255 design spec, registered in MLflow's global ``JudgeToolRegistry`` so
5+
they are available to any trace-based judge.
6+
7+
Key difference from spec: tools accept ``trace: Trace`` (required by the
8+
``JudgeTool`` interface) but use the internal ``EvalCriteriaSet`` for skill
9+
lookup. When the native ``make_judge(skills=[...])`` API lands, replace
10+
this module with MLflow's built-in skill tools which route via type
11+
annotation.
12+
13+
Registry invocation flow::
14+
15+
registry.invoke(tool_call, trace)
16+
→ json.loads(tool_call.function.arguments)
17+
→ tool.invoke(trace, **parsed_args)
18+
"""
19+
20+
from __future__ import annotations
21+
22+
import logging
23+
import os
24+
from typing import Any
25+
26+
from mlflow.entities.trace import Trace
27+
from mlflow.genai.judges.tools.base import JudgeTool
28+
from mlflow.genai.judges.tools.registry import register_judge_tool
29+
from mlflow.types.llm import FunctionToolDefinition, ParamProperty, ToolDefinition, ToolParamsSchema
30+
31+
from .eval_criteria import EvalCriteriaSet
32+
33+
logger = logging.getLogger(__name__)
34+
35+
36+
class ReadEvalCriteriaTool(JudgeTool):
    """Expose the full body of an evaluation-criteria skill to the judge.

    Invoked by the judge when a criteria's description appears to match
    the trace it is evaluating.
    """

    def __init__(self, criteria_set: EvalCriteriaSet):
        self._criteria_set = criteria_set

    @property
    def name(self) -> str:
        return "read_eval_criteria"

    def get_definition(self) -> ToolDefinition:
        """Build the tool schema advertised to the judge's LLM.

        The list of available criteria names is interpolated into the
        description so the judge knows what it may request.
        """
        names = self._criteria_set.names
        skill_name_param = ParamProperty(
            type="string", description="Name of the evaluation criteria to read"
        )
        function = FunctionToolDefinition(
            name="read_eval_criteria",
            description=(
                "Read the full content of an evaluation criteria skill to get domain-specific "
                "rubrics, scoring rules, and reference material. Use this when a criteria's "
                f"description matches the trace content. Available criteria: {names}"
            ),
            parameters=ToolParamsSchema(properties={"skill_name": skill_name_param}),
        )
        return ToolDefinition(function=function)

    def invoke(self, trace: Trace, skill_name: str) -> str:
        """Return the named skill's body, or an error string the judge can read."""
        skill = self._criteria_set.get_skill(skill_name)
        if not skill:
            available = self._criteria_set.names
            return f"Error: No criteria named '{skill_name}'. Available: {available}"
        return skill.body
76+
77+
78+
class ReadEvalReferenceTool(JudgeTool):
    """Read a reference document from a criteria's ``references/`` directory.

    Used for detailed rubrics, edge cases, and scoring examples.
    """

    def __init__(self, criteria_set: EvalCriteriaSet):
        self._criteria_set = criteria_set

    @property
    def name(self) -> str:
        return "read_eval_reference"

    def get_definition(self) -> ToolDefinition:
        """Build the tool schema advertised to the judge's LLM."""
        return ToolDefinition(
            function=FunctionToolDefinition(
                name="read_eval_reference",
                description=(
                    "Read a reference document from an evaluation criteria skill for detailed "
                    "rubrics, edge cases, or scoring examples."
                ),
                parameters=ToolParamsSchema(
                    properties={
                        "skill_name": ParamProperty(type="string", description="Name of the evaluation criteria"),
                        "file_path": ParamProperty(
                            type="string",
                            description="Relative path within the skill (e.g., 'references/RUBRIC.md')",
                        ),
                    },
                ),
            ),
        )

    def invoke(self, trace: Trace, skill_name: str, file_path: str) -> str:
        """Return the reference file's content, or an error string the judge can read.

        ``file_path`` is validated so the judge cannot escape the skill's
        directory: absolute paths and paths whose first normalized component
        is ``..`` are rejected.
        """
        skill = self._criteria_set.get_skill(skill_name)
        if not skill:
            available = self._criteria_set.names
            return f"Error: No criteria named '{skill_name}'. Available: {available}"
        normalized = os.path.normpath(file_path)
        # normpath collapses interior "..", so traversal can only survive as a
        # leading ".." path component. Comparing the first component (instead
        # of startswith("..")) avoids falsely rejecting legitimate filenames
        # that merely begin with two dots, e.g. "..notes.md".
        if os.path.isabs(normalized) or normalized.split(os.sep)[0] == "..":
            return f"Error: Invalid file path '{file_path}'. Must be relative."
        if normalized not in skill.references:
            return f"Error: File '{file_path}' not found in '{skill_name}'"
        return skill.references[normalized]
123+
124+
# Process-wide guard: MLflow's registry is global, so register at most once.
_registered = False


def register_eval_tools(criteria_set: EvalCriteriaSet) -> None:
    """Register eval-criteria tools in MLflow's global ``JudgeToolRegistry``.

    Safe to call multiple times — tools are registered only once per process.
    """
    global _registered
    if _registered:
        return
    skills = criteria_set.skills
    if not skills:
        logger.debug("No eval criteria loaded; skipping tool registration")
        return
    for tool in (ReadEvalCriteriaTool(criteria_set), ReadEvalReferenceTool(criteria_set)):
        register_judge_tool(tool)
    _registered = True
    logger.info(
        "Registered eval criteria judge tools (%d criteria available)", len(skills)
    )

0 commit comments

Comments
 (0)