From d1caa77204710a41de1110d62827c159ed47f08f Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 17:43:09 +0800 Subject: [PATCH 01/42] feat(benchmark): add TAU-2 trajectory memory treatment --- benchmark/tau2/README.md | 35 +++++++++--- benchmark/tau2/config/trajectory.yaml | 22 ++++++++ benchmark/tau2/scripts/run_eval.py | 8 ++- benchmark/tau2/scripts/run_memory_v2_eval.py | 4 +- .../templates/memory/trajectories.yaml | 54 ++++++++++--------- .../agent_trajectory_context_provider.py | 14 +++-- 6 files changed, 96 insertions(+), 41 deletions(-) create mode 100644 benchmark/tau2/config/trajectory.yaml diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 7ebdb0807e..f53f7c99f1 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -1,13 +1,13 @@ # TAU-2 Benchmark This directory contains a small OpenViking-style entry point for TAU-2 memory -evaluation. The first version is intentionally narrow: +evaluation. The scope is intentionally narrow: - fresh OpenViking Memory V2 experience-only baseline; - Memory V2 pre-write recall treatment. +- trajectory-view retrieval treatment for the refined trajectory prompt. -Trajectory / procedure-view prompts, category rerank, and other harness-only -diagnostics are intentionally left out of this first PR. +Category rerank and other harness-only diagnostics are intentionally left out. ## Layout @@ -16,7 +16,8 @@ benchmark/tau2/ ├── config/ │ ├── baseline.yaml │ ├── official.yaml -│ └── prewrite.yaml +│ ├── prewrite.yaml +│ └── trajectory.yaml ├── scripts/ │ ├── run_eval.py │ ├── setup_tau2_repo.sh @@ -77,6 +78,18 @@ benchmark/tau2/run_full_eval.sh \ --repeat-count 1 ``` +Plan a one-cell trajectory-view smoke: + +```bash +benchmark/tau2/run_full_eval.sh \ + --config benchmark/tau2/config/trajectory.yaml \ + --domain retail \ + --strategy-id memory_v2_trajectory_view \ + --num-tasks 1 \ + --train-num-tasks 1 \ + --repeat-count 1 +``` + Run the Memory V2 8-trial matrix (`retail + airline` x 2 strategies x 8 repeats): ```bash @@ -104,20 +117,26 @@ and `OPENAI_API_BASE` for LiteLLM before running upstream TAU-2. Start the OpenViking service before executing memory cells, and verify it with `ov status`. For evidence runs, use a clean OpenViking workspace/config and set `OPENVIKING_URL` explicitly so local custom memory templates do not pollute the -Memory V2 baseline. +Memory V2 baseline. For trajectory-view evidence, start the service from this +branch and inspect generated trajectory files; changing `search_uri` alone does +not prove the new trajectory prompt was used. ## Memory Adapter -`memory_v2_experience_only` and `memory_v2_prewrite` cells run through a small -TAU-2 agent adapter in this directory: +Memory V2 cells run through a small TAU-2 agent adapter in this directory: - train by writing TAU-2 training conversations into OpenViking sessions; -- evaluate by retrieving OpenViking experience memory at the first user turn; +- evaluate by retrieving OpenViking memory at the first user turn; - for pre-write recall, retrieve again before write-like tool calls and regenerate that step with the matched memories; - emit artifact metadata to identify the OpenViking account, agent, corpus, retrieval mode, and simulator policy used by each cell. +The existing `train_memory_mode: experience_only` value selects the Memory V2 +session-commit path. `search_memory_type` selects which generated memory bucket +is retrieved during eval (`experiences` by default, `trajectories` for +`config/trajectory.yaml`). + ## User Simulator Policy The runner default is the official TAU-2 user simulator if diff --git a/benchmark/tau2/config/trajectory.yaml b/benchmark/tau2/config/trajectory.yaml new file mode 100644 index 0000000000..5aad55fae1 --- /dev/null +++ b/benchmark/tau2/config/trajectory.yaml @@ -0,0 +1,22 @@ +extends: baseline.yaml + +benchmark: + name: tau2_openviking_trajectory_view + +strategies: + - id: memory_v2_trajectory_view + label: OpenViking Memory V2 trajectory-view first-user recall + memory_backend: openviking + train_required: true + corpus_id: memory_v2_trajectory_view + train_memory_mode: experience_only + search_memory_type: trajectories + retrieval_mode: first_user + - id: memory_v2_trajectory_prewrite + label: OpenViking Memory V2 trajectory-view pre-write recall + memory_backend: openviking + train_required: true + corpus_id: memory_v2_trajectory_view + train_memory_mode: experience_only + search_memory_type: trajectories + retrieval_mode: first_user_prewrite diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 5458ba61ac..2bfa6b877f 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -90,7 +90,12 @@ def _tau2_command( account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}" agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" user = f"tau2-{domain}-{corpus_id}" - search_uri = f"viking://agent/{agent_id}/memories/experiences" + search_memory_type = str(strategy.get("search_memory_type", "experiences")) + if search_memory_type not in {"experiences", "trajectories"}: + raise ValueError( + f"Unsupported search_memory_type for {strategy['id']}: {search_memory_type}" + ) + search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" command = [ sys.executable, str(Path(__file__).with_name("run_memory_v2_eval.py")), @@ -257,6 +262,7 @@ def _build_plan( "memory_backend": strategy.get("memory_backend"), "corpus_id": strategy.get("corpus_id", strategy["id"]), "retrieval_mode": strategy.get("retrieval_mode"), + "search_memory_type": strategy.get("search_memory_type", "experiences"), "adapter_status": strategy.get("adapter_status", "ready"), "executable": command is not None, "user_simulator_policy": user_simulator_policy(config), diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index de5ef54411..75757415c6 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -488,7 +488,7 @@ def generate_next_message(self, message, state: LLMAgentState): prompt = ( "No OpenViking memory matched this user request." if not block - else "Use these OpenViking experience memories only when they match the current task:\n\n" + else "Use these OpenViking memories only when they match the current task:\n\n" + block ) state.system_messages[marker_index] = SystemMessage(role="system", content=prompt) @@ -528,7 +528,7 @@ def generate_next_message(self, message, state: LLMAgentState): if block: prompt = ( "Before executing the pending write-like tool call, use these " - "OpenViking experience memories only when they match the current task:\n\n" + "OpenViking memories only when they match the current task:\n\n" + block ) assistant_message = self._generate( diff --git a/openviking/prompts/templates/memory/trajectories.yaml b/openviking/prompts/templates/memory/trajectories.yaml index bd894dfefa..ebd52c1069 100644 --- a/openviking/prompts/templates/memory/trajectories.yaml +++ b/openviking/prompts/templates/memory/trajectories.yaml @@ -1,6 +1,6 @@ memory_type: trajectories description: | - A record of agent execution in one business domain within a conversation. + A compact, reusable view of how the agent handled one task in a conversation. Extract when the agent worked through identifiable tasks involving decisions, tool calls, or multi-step actions. Skip pure chitchat or simple Q&A with no execution trace. @@ -32,31 +32,35 @@ fields: - name: content type: string description: | - Execution trace in EXACTLY this format: - - Goal: - - Trajectory: - 1. . - Actions: . - Progress: . - 2. <...>. - Actions: <...>. - Progress: <...>. - 3. <... continue sequentially to capture the entire execution> - - Result: - - Fail reason: - - Rules for 'Trajectory' section: - - STANDALONE COMPLETENESS: The trajectory must be comprehensive enough that a reader can fully understand exactly how the agent performed the task, what attempts it made, and what tools it used, entirely without the raw logs. - - EXHAUSTIVE TRACKING: Record every logical interaction, tool use, and system response in chronological order as a numbered list. - - SUMMARIZE LONG TEXTS: Do NOT record exact tool responses, raw JSON payloads, or verbatim user/agent messages, especially when they are very long. You MUST use a concise, summarized version that captures the core meaning without losing any key information, constraints, or data points critical to the task. - - CAPTURE ALL MISTAKES: You MUST explicitly detail any errors, false attempts, wrong function calls, or agent misunderstandings. Do not gloss over failures, dead ends, or retry loops. - - INTENT & PROGRESS TRACKING: Every step must clearly state the current agent intention, summarize the 'Actions/Events' taken, and conclude with 'Progress' (evaluating what was achieved or blocked). - - TOOL TRACKING (under 'Actions'): - * Format (Success): "Called -> Response: ." - * Format (Error/Mistake): "Called -> Error/Issue: | Context: ." + Procedure-like trajectory view in EXACTLY this format: + + - Trigger: + - Preconditions: + 1. + 2. <...> + - Procedure: + 1. + 2. + 3. + - Anti-patterns: + - + - Applicability Boundary: + - + - + - Result: + - Evidence: + + Rules: + - Write for future agent execution, not for human audit. Prefer clear instructions over chronological narration. + - Preserve the successful or best-known path: critical reads, required policy checks, confirmation steps, write-tool ordering, and final user-facing completion. + - Keep negative lessons in Anti-patterns, not mixed into Procedure. + - Keep the memory grounded in this session, but abstract away user-specific names, raw IDs, exact payloads, and long tool responses unless they are essential to the reusable procedure. + - Mention tool names when they are part of the reusable path, but summarize observations instead of copying raw JSON. + - If the session failed or was partial, still write the best reusable lesson: put the corrected approach in Procedure and the failure cause in Anti-patterns / Evidence. + - Avoid broad SOPs. The Trigger and Applicability Boundary should make this record narrower than a whole domain workflow. General Rules: - - Use exactly the 4 labels (Goal, Trajectory, Result, Fail reason) in this exact order. - - Goal and Result are ONE sentence each. + - Use exactly the 7 labels above in this exact order. + - Trigger, Result, and Evidence are ONE sentence each. - No extra headings, free paragraphs, or closing remarks. merge_op: patch diff --git a/openviking/session/memory/agent_trajectory_context_provider.py b/openviking/session/memory/agent_trajectory_context_provider.py index 2316052f77..0621da1246 100644 --- a/openviking/session/memory/agent_trajectory_context_provider.py +++ b/openviking/session/memory/agent_trajectory_context_provider.py @@ -24,14 +24,18 @@ class AgentTrajectoryContextProvider(SessionExtractContextProvider): - """Phase 1 provider: extract trajectory summaries from conversation.""" + """Phase 1 provider: extract reusable trajectory-view memories.""" def instruction(self) -> str: output_language = self._output_language - return f"""You are a memory extraction agent. Summarize this agent session as a trajectory record. - -One session = one trajectory. Always output exactly one, no exceptions. -Sub-tasks, pivots, errors, and follow-ups are numbered steps inside that one record — not separate trajectories. + return f"""You are a memory extraction agent. Convert this agent session into a reusable trajectory-view memory. + +One session = one trajectory-view record. Always output exactly one record. +Write the record as a compact procedure-like view of the useful execution pattern, +not as a raw transcript. Keep the future agent's decision points, tool path, +confirmation/write boundary, failure corrections, and applicability boundary. +Sub-tasks, pivots, errors, and follow-ups are folded into that one record as steps, +guardrails, or evidence — not separate trajectories. Output a JSON object with a `trajectories` array containing exactly one item. Follow field descriptions in the schema. JSON only, no explanation. From a68d5e7672e2cff6c1d6ed3961d7de717b70f3b8 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 18:31:24 +0800 Subject: [PATCH 02/42] style(benchmark): format tau2 trajectory scripts --- benchmark/tau2/scripts/run_eval.py | 64 ++++++++++++------- benchmark/tau2/scripts/run_memory_v2_eval.py | 30 ++++++--- .../agent_trajectory_context_provider.py | 1 - 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 2bfa6b877f..7a38a34d41 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -12,8 +12,8 @@ from tau2_common import ( domains, load_config, - output_dir, normalize_litellm_env, + output_dir, run_id, simulator_policy_report, split_file, @@ -55,7 +55,9 @@ def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: return { "simulation_count": len(sims), "avg_reward": sum(rewards) / len(rewards) if rewards else 0.0, - "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) if db_known else None, + "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) + if db_known + else None, } @@ -104,11 +106,7 @@ def _tau2_command( "--run-dir", str(output_dir(config, configured_run_id) / "memory_cells" / run_label), "--corpus-dir", - str( - output_dir(config, configured_run_id) - / "memory_corpora" - / f"{domain}_{corpus_id}" - ), + str(output_dir(config, configured_run_id) / "memory_corpora" / f"{domain}_{corpus_id}"), "--run-label", run_label, "--strategy-id", @@ -153,7 +151,9 @@ def _tau2_command( command.extend(["--task-id", task_id]) elif num_tasks is not None: command.extend(["--num-tasks", str(num_tasks)]) - train_num_tasks = train_num_tasks if train_num_tasks is not None else strategy.get("train_num_tasks") + train_num_tasks = ( + train_num_tasks if train_num_tasks is not None else strategy.get("train_num_tasks") + ) if train_num_tasks is not None: command.extend(["--train-num-tasks", str(train_num_tasks)]) return command @@ -219,7 +219,9 @@ def _build_plan( unknown = selected_strategy_ids - set(strategy_ids(config)) if unknown: raise ValueError(f"unknown strategy ids: {sorted(unknown)}") - strategies = [strategy for strategy in strategies if strategy["id"] in selected_strategy_ids] + strategies = [ + strategy for strategy in strategies if strategy["id"] in selected_strategy_ids + ] cells = [] plan_domains = domains(config) if selected_domains: @@ -299,9 +301,7 @@ def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, st "retrieval_trace": str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl"), "corpus_manifest": str(corpus_dir / "corpus_manifest.json"), } - return { - "results": str(repo / "data" / "simulations" / f"{cell['run_label']}.json") - } + return {"results": str(repo / "data" / "simulations" / f"{cell['run_label']}.json")} def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None: @@ -333,7 +333,9 @@ def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: if row["metrics"].get("db_match_rate") is not None and int(row["metrics"].get("simulation_count") or 0) > 0 ] - db_weight = sum(int(row["metrics"].get("simulation_count") or 0) for row in db_weighted_rows) + db_weight = sum( + int(row["metrics"].get("simulation_count") or 0) for row in db_weighted_rows + ) db_sum = sum( float(row["metrics"]["db_match_rate"]) * int(row["metrics"].get("simulation_count") or 0) @@ -409,7 +411,9 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str rows.append(row) write_json(out / "cell_results" / f"{cell['run_label']}.json", row) if completed.returncode != 0: - raise RuntimeError(f"cell failed: {cell['run_label']} returncode={completed.returncode}") + raise RuntimeError( + f"cell failed: {cell['run_label']} returncode={completed.returncode}" + ) return rows @@ -425,7 +429,9 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: if strict and not llm_env["has_api_key"]: errors.append("missing LLM API key: set OPENAI_API_KEY or ARK_API_KEY") if strict and not llm_env["has_base_url"]: - errors.append("missing OpenAI-compatible base URL: set OPENAI_API_BASE, OPENAI_BASE_URL, or ARK_BASE_URL") + errors.append( + "missing OpenAI-compatible base URL: set OPENAI_API_BASE, OPENAI_BASE_URL, or ARK_BASE_URL" + ) if strict and not policy_report["supported"]: errors.append( "configured confirmation-aware user simulator policy requires a TAU-2 " @@ -469,14 +475,28 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: def main() -> int: parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.") - parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml") + parser.add_argument( + "--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml" + ) parser.add_argument("--run-id", default=run_id()) - parser.add_argument("--domain", action="append", help="Run only this configured domain; may be repeated.") - parser.add_argument("--repeat-count", type=int, help="Override benchmark.repeat_count for smoke runs.") - parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may be repeated.") - parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.") - parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.") - parser.add_argument("--train-num-tasks", type=int, help="Train OpenViking memory on the first N train tasks.") + parser.add_argument( + "--domain", action="append", help="Run only this configured domain; may be repeated." + ) + parser.add_argument( + "--repeat-count", type=int, help="Override benchmark.repeat_count for smoke runs." + ) + parser.add_argument( + "--strategy-id", action="append", help="Run only this strategy id; may be repeated." + ) + parser.add_argument( + "--task-id", action="append", help="Run only this TAU-2 task id; may be repeated." + ) + parser.add_argument( + "--num-tasks", type=int, help="Run the first N tasks from the selected split." + ) + parser.add_argument( + "--train-num-tasks", type=int, help="Train OpenViking memory on the first N train tasks." + ) parser.add_argument( "--preflight", action="store_true", diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 75757415c6..64384f4d60 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -11,7 +11,6 @@ from tau2_common import normalize_litellm_env - AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] WRITE_TOOL_PREFIXES = ( @@ -86,7 +85,9 @@ def _metrics(results_path: Path) -> dict[str, Any]: return { "simulation_count": len(sims), "avg_reward": sum(rewards) / len(rewards) if rewards else 0.0, - "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) if db_known else None, + "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) + if db_known + else None, } @@ -118,12 +119,14 @@ def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str: recent_user = [ str(getattr(message, "content", "") or "") for message in state_messages[-8:] - if str(getattr(message, "role", "")) == "user" and str(getattr(message, "content", "") or "").strip() + if str(getattr(message, "role", "")) == "user" + and str(getattr(message, "content", "") or "").strip() ] recent_observations = [ str(getattr(message, "content", "") or "")[:600] for message in state_messages[-12:] - if str(getattr(message, "role", "")) == "tool" and str(getattr(message, "content", "") or "").strip() + if str(getattr(message, "role", "")) == "tool" + and str(getattr(message, "content", "") or "").strip() ] parts = [ "Before executing write-like tool call(s): " + "; ".join(rendered), @@ -301,7 +304,9 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) committed = [] try: for sim in data.get("simulations") or []: - session_id = f"tau2-{args.domain}-train-{sim.get('task_id')}-trial-{sim.get('trial', 0)}" + session_id = ( + f"tau2-{args.domain}-train-{sim.get('task_id')}-trial-{sim.get('trial', 0)}" + ) created = client.create_session(session_id=session_id) sid = created.get("session_id", session_id) for msg in sim.get("messages") or []: @@ -373,7 +378,9 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: client = _client(args) rows: list[dict[str, Any]] = [] try: - result = client.search(query=query, target_uri=args.search_uri, limit=args.retrieval_top_k) + result = client.search( + query=query, target_uri=args.search_uri, limit=args.retrieval_top_k + ) memories = list(getattr(result, "memories", []) or []) blocks = [] for index, match in enumerate(memories[: args.retrieval_top_k], 1): @@ -404,7 +411,9 @@ def _trace_injection_fields(block: str, matches: list[dict[str, Any]]) -> dict[s return { "injected": bool(block.strip()), "injected_count": injected_count if block.strip() else 0, - "retrieval_action_taken": "retrieve_and_inject" if block.strip() else "retrieve_no_injection", + "retrieval_action_taken": "retrieve_and_inject" + if block.strip() + else "retrieve_no_injection", } def _generate(self, messages): @@ -476,7 +485,8 @@ def generate_next_message(self, message, state: LLMAgentState): ( i for i, item in enumerate(state.system_messages) - if isinstance(item, SystemMessage) and item.content == "" + if isinstance(item, SystemMessage) + and item.content == "" ), None, ) @@ -528,8 +538,7 @@ def generate_next_message(self, message, state: LLMAgentState): if block: prompt = ( "Before executing the pending write-like tool call, use these " - "OpenViking memories only when they match the current task:\n\n" - + block + "OpenViking memories only when they match the current task:\n\n" + block ) assistant_message = self._generate( state.system_messages @@ -540,6 +549,7 @@ def generate_next_message(self, message, state: LLMAgentState): return assistant_message, state if AGENT_NAME not in registry.get_agents(): + def create_openviking_memory_agent(tools, domain_policy, **kwargs): return OpenVikingMemoryAgent( tools=tools, diff --git a/openviking/session/memory/agent_trajectory_context_provider.py b/openviking/session/memory/agent_trajectory_context_provider.py index 0621da1246..6b1280b1d0 100644 --- a/openviking/session/memory/agent_trajectory_context_provider.py +++ b/openviking/session/memory/agent_trajectory_context_provider.py @@ -14,7 +14,6 @@ from openviking.session.memory.session_extract_context_provider import ( SessionExtractContextProvider, ) -from openviking.storage.viking_fs import VikingFS from openviking_cli.utils import get_logger logger = get_logger(__name__) From 4391dd451d9f58176aa1fe473be0c5d76247d442 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 19:03:21 +0800 Subject: [PATCH 03/42] feat(benchmark): add tau2 trajectory category rerank --- benchmark/tau2/README.md | 25 +- benchmark/tau2/config/category_catalog.json | 110 +++++ benchmark/tau2/config/category_rerank.yaml | 32 ++ benchmark/tau2/scripts/category_rerank.py | 457 +++++++++++++++++++ benchmark/tau2/scripts/run_eval.py | 39 ++ benchmark/tau2/scripts/run_memory_v2_eval.py | 57 ++- 6 files changed, 710 insertions(+), 10 deletions(-) create mode 100644 benchmark/tau2/config/category_catalog.json create mode 100644 benchmark/tau2/config/category_rerank.yaml create mode 100644 benchmark/tau2/scripts/category_rerank.py diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index f53f7c99f1..0aac24e2aa 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -6,8 +6,10 @@ evaluation. The scope is intentionally narrow: - fresh OpenViking Memory V2 experience-only baseline; - Memory V2 pre-write recall treatment. - trajectory-view retrieval treatment for the refined trajectory prompt. +- experimental category-reranked pre-write recall on top of trajectory-view memory. -Category rerank and other harness-only diagnostics are intentionally left out. +Category rerank is opt-in and experimental; it is meant for PR-C review and +smoke/targeted probes before any productization decision. ## Layout @@ -15,6 +17,7 @@ Category rerank and other harness-only diagnostics are intentionally left out. benchmark/tau2/ ├── config/ │ ├── baseline.yaml +│ ├── category_rerank.yaml │ ├── official.yaml │ ├── prewrite.yaml │ └── trajectory.yaml @@ -90,6 +93,18 @@ benchmark/tau2/run_full_eval.sh \ --repeat-count 1 ``` +Plan a one-cell trajectory category-rerank smoke: + +```bash +benchmark/tau2/run_full_eval.sh \ + --config benchmark/tau2/config/category_rerank.yaml \ + --domain retail \ + --strategy-id memory_v2_trajectory_category_prewrite \ + --num-tasks 1 \ + --train-num-tasks 1 \ + --repeat-count 1 +``` + Run the Memory V2 8-trial matrix (`retail + airline` x 2 strategies x 8 repeats): ```bash @@ -137,6 +152,14 @@ session-commit path. `search_memory_type` selects which generated memory bucket is retrieved during eval (`experiences` by default, `trajectories` for `config/trajectory.yaml`). +`config/category_rerank.yaml` keeps the PR-B trajectory memory route and enables +an adapter-local category rerank only at `before_write_tool_call`. The reranker +loads `config/category_catalog.json`, annotates the runtime query and candidate +memories from visible text/tool names/URIs, retrieves a wider candidate pool, +then injects only the top category-aligned memories. Retrieval traces include +the query category, candidate memory categories, rerank reasons, selected rows, +and skipped rows. + ## User Simulator Policy The runner default is the official TAU-2 user simulator if diff --git a/benchmark/tau2/config/category_catalog.json b/benchmark/tau2/config/category_catalog.json new file mode 100644 index 0000000000..93fde297f2 --- /dev/null +++ b/benchmark/tau2/config/category_catalog.json @@ -0,0 +1,110 @@ +{ + "schema_version": "openviking.tau2.category_catalog.v0", + "description": "Runtime-visible TAU-2 category catalog for the experimental trajectory category-rerank branch. Triggers use user text, candidate write tool names, trajectory text, and memory URIs only.", + "domains": { + "airline": { + "categories": [ + { + "category_id": "airline_reservation_booking:new_booking_or_rebook", + "category1": "airline_reservation_booking", + "category2": "new_booking_or_rebook", + "query_triggers": ["book_reservation", "book flight", "new reservation", "new booking", "rebook", "book a flight"], + "memory_triggers": ["book_reservation", "new_booking", "reservation booking", "new booking", "rebook"], + "negative_triggers": ["cancel_reservation", "update_reservation"] + }, + { + "category_id": "airline_reservation_cancellation:cancellation_refund", + "category1": "airline_reservation_cancellation", + "category2": "cancellation_refund", + "query_triggers": ["cancel_reservation", "cancel flight", "cancel booking", "refund", "cancellation"], + "memory_triggers": ["cancel_reservation", "cancellation", "refund", "cancel booking"], + "negative_triggers": ["book_reservation"] + }, + { + "category_id": "airline_reservation_flight_update:flight_change_update", + "category1": "airline_reservation_flight_update", + "category2": "flight_change_update", + "query_triggers": ["update_reservation_flights", "change flight", "modify flight", "flight update", "alternate flight"], + "memory_triggers": ["update_reservation_flights", "flight update", "change flight", "modify flight"], + "negative_triggers": ["baggage", "passenger"] + }, + { + "category_id": "airline_reservation_baggage_update:baggage_update", + "category1": "airline_reservation_baggage_update", + "category2": "baggage_update", + "query_triggers": ["update_reservation_baggages", "baggage", "bags", "checked bag", "luggage"], + "memory_triggers": ["update_reservation_baggages", "baggage", "checked bag", "luggage"], + "negative_triggers": ["cancel_reservation"] + }, + { + "category_id": "airline_reservation_passenger_update:passenger_update", + "category1": "airline_reservation_passenger_update", + "category2": "passenger_update", + "query_triggers": ["update_reservation_passengers", "passenger", "traveler", "date of birth", "dob"], + "memory_triggers": ["update_reservation_passengers", "passenger update", "traveler", "date of birth"], + "negative_triggers": ["baggage"] + } + ] + }, + "retail": { + "categories": [ + { + "category_id": "retail_order_post_shipment_service_request:delivered_order_exchange", + "category1": "retail_order_post_shipment_service_request", + "category2": "delivered_order_exchange", + "query_triggers": ["exchange_delivered_order_items", "exchange", "swap", "replacement", "delivered order exchange"], + "memory_triggers": ["exchange_delivered_order_items", "delivered order exchange", "post shipment exchange", "replacement"], + "negative_triggers": ["pending order", "cancel_pending_order"] + }, + { + "category_id": "retail_order_post_shipment_service_request:delivered_order_return", + "category1": "retail_order_post_shipment_service_request", + "category2": "delivered_order_return", + "query_triggers": ["return_delivered_order_items", "return", "refund", "delivered order return"], + "memory_triggers": ["return_delivered_order_items", "delivered order return", "partial return", "refund"], + "negative_triggers": ["pending order"] + }, + { + "category_id": "pending_order_item_modify:pending_item_modify", + "category1": "pending_order_item_modify", + "category2": "pending_item_modify", + "query_triggers": ["modify_pending_order_items", "pending order item", "cheapest variant", "change item", "modify item"], + "memory_triggers": ["modify_pending_order_items", "pending order item", "item modification", "cheapest variant"], + "negative_triggers": ["delivered order"] + }, + { + "category_id": "pending_address_modify:pending_order_address_modify", + "category1": "pending_address_modify", + "category2": "pending_order_address_modify", + "query_triggers": ["modify_pending_order_address", "shipping address", "delivery address", "pending order address"], + "memory_triggers": ["modify_pending_order_address", "shipping address", "pending order address", "address modification"], + "negative_triggers": ["delivered order return"] + }, + { + "category_id": "pending_order_payment_modify:pending_payment_modify", + "category1": "pending_order_payment_modify", + "category2": "pending_payment_modify", + "query_triggers": ["modify_pending_order_payment", "payment method", "change payment", "pending order payment"], + "memory_triggers": ["modify_pending_order_payment", "payment method", "pending payment"], + "negative_triggers": ["delivered order"] + }, + { + "category_id": "pending_order_cancellation:pending_cancel", + "category1": "pending_order_cancellation", + "category2": "pending_cancel", + "query_triggers": ["cancel_pending_order", "cancel pending", "cancel order", "pending cancellation"], + "memory_triggers": ["cancel_pending_order", "pending cancellation", "cancel pending order"], + "negative_triggers": ["delivered order return"] + }, + { + "category_id": "user_account_address_modify:profile_address_modify", + "category1": "user_account_address_modify", + "category2": "profile_address_modify", + "query_triggers": ["modify_user_address", "account address", "default shipping address", "profile address"], + "memory_triggers": ["modify_user_address", "account address", "profile address", "default shipping address"], + "negative_triggers": ["order item"] + } + ] + } + } +} diff --git a/benchmark/tau2/config/category_rerank.yaml b/benchmark/tau2/config/category_rerank.yaml new file mode 100644 index 0000000000..4028148014 --- /dev/null +++ b/benchmark/tau2/config/category_rerank.yaml @@ -0,0 +1,32 @@ +extends: trajectory.yaml + +benchmark: + name: tau2_openviking_trajectory_category_rerank + +strategies: + - id: memory_v2_trajectory_prewrite + label: OpenViking Memory V2 trajectory-view pre-write recall + memory_backend: openviking + train_required: true + corpus_id: memory_v2_trajectory_view + train_memory_mode: experience_only + search_memory_type: trajectories + retrieval_mode: first_user_prewrite + - id: memory_v2_trajectory_category_prewrite + label: OpenViking Memory V2 trajectory-view category-reranked pre-write recall + memory_backend: openviking + train_required: true + corpus_id: memory_v2_trajectory_view + train_memory_mode: experience_only + search_memory_type: trajectories + retrieval_mode: first_user_prewrite + category_rerank: + enabled: true + catalog_path: benchmark/tau2/config/category_catalog.json + apply_nodes: + - before_write_tool_call + retrieve_limit: 6 + inject_limit: 4 + positive_match_required: true + no_match_policy: skip_injection + search_score_weight: 1.0 diff --git a/benchmark/tau2/scripts/category_rerank.py b/benchmark/tau2/scripts/category_rerank.py new file mode 100644 index 0000000000..daa61de9c6 --- /dev/null +++ b/benchmark/tau2/scripts/category_rerank.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import json +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +def _normalize(text: Any) -> str: + lowered = str(text or "").lower() + return re.sub(r"[^a-z0-9_]+", " ", lowered) + + +def _as_list(value: Any) -> list[str]: + if isinstance(value, str): + return [value] if value.strip() else [] + if isinstance(value, list): + return [str(item).strip() for item in value if str(item).strip()] + return [] + + +def _as_bool(value: Any, default: bool = False) -> bool: + if value is None: + return default + if isinstance(value, bool): + return value + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + +def _as_int(value: Any, default: int) -> int: + try: + parsed = int(value) + except (TypeError, ValueError): + return default + return parsed if parsed > 0 else default + + +def _public_row(row: dict[str, Any]) -> dict[str, Any]: + return {key: value for key, value in row.items() if not key.startswith("_")} + + +def _score_value(value: Any) -> float: + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +@dataclass(frozen=True) +class CategoryEntry: + category_id: str + category1: str + category2: str + query_triggers: tuple[str, ...] + memory_triggers: tuple[str, ...] + negative_triggers: tuple[str, ...] + + @classmethod + def from_payload(cls, payload: dict[str, Any]) -> "CategoryEntry | None": + category1 = str(payload.get("category1") or "").strip() + category2 = str(payload.get("category2") or "").strip() + category_id = str(payload.get("category_id") or "").strip() + if not category_id and category1: + category_id = category1 if not category2 else f"{category1}:{category2}" + if not category_id or not category1: + return None + query_triggers = tuple( + dict.fromkeys( + _as_list(payload.get("query_triggers")) + + _as_list(payload.get("triggers")) + + [category_id, category1, category2] + ) + ) + memory_triggers = tuple( + dict.fromkeys( + _as_list(payload.get("memory_triggers")) + + _as_list(payload.get("triggers")) + + [category_id, category1, category2] + ) + ) + return cls( + category_id=category_id, + category1=category1, + category2=category2, + query_triggers=query_triggers, + memory_triggers=memory_triggers, + negative_triggers=tuple(_as_list(payload.get("negative_triggers"))), + ) + + +class CategoryReranker: + def __init__( + self, + *, + enabled: bool, + apply_nodes: set[str], + catalog: dict[str, list[CategoryEntry]], + load_report: dict[str, Any], + retrieve_limit: int | None, + inject_limit: int | None, + positive_match_required: bool, + no_match_policy: str, + search_score_weight: float, + ) -> None: + self.enabled = enabled + self.apply_nodes = apply_nodes + self.catalog = catalog + self.load_report = load_report + self.retrieve_limit = retrieve_limit + self.inject_limit = inject_limit + self.positive_match_required = positive_match_required + self.no_match_policy = no_match_policy + self.search_score_weight = search_score_weight + + @classmethod + def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "CategoryReranker": + payload = payload if isinstance(payload, dict) else {} + enabled = _as_bool(payload.get("enabled"), default=False) + apply_nodes = set(_as_list(payload.get("apply_nodes")) or ["before_write_tool_call"]) + if enabled: + catalog, load_report = _load_catalog(payload.get("catalog_path"), repo_root=repo_root) + if not load_report.get("loaded"): + raise ValueError(f"category rerank catalog failed to load: {load_report}") + else: + catalog = {} + load_report = { + "path": None, + "loaded": False, + "domain_count": 0, + "category_count": 0, + "errors": [], + } + return cls( + enabled=enabled, + apply_nodes=apply_nodes, + catalog=catalog, + load_report=load_report, + retrieve_limit=_as_int(payload.get("retrieve_limit"), 0) or None, + inject_limit=_as_int(payload.get("inject_limit"), 0) or None, + positive_match_required=_as_bool(payload.get("positive_match_required"), default=True), + no_match_policy=str(payload.get("no_match_policy") or "skip_injection"), + search_score_weight=float(payload.get("search_score_weight") or 1.0), + ) + + def search_limit(self, base_limit: int, *, decision_node: str) -> int: + if self.enabled and decision_node in self.apply_nodes and self.retrieve_limit: + return max(base_limit, self.retrieve_limit) + return base_limit + + def summary(self) -> dict[str, Any]: + return { + "enabled": self.enabled, + "apply_nodes": sorted(self.apply_nodes), + "retrieve_limit": self.retrieve_limit, + "inject_limit": self.inject_limit, + "positive_match_required": self.positive_match_required, + "no_match_policy": self.no_match_policy, + "catalog": self.load_report, + } + + def select( + self, + *, + domain: str, + query: str, + rows: list[dict[str, Any]], + decision_node: str, + base_limit: int, + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]: + if not self.enabled or decision_node not in self.apply_nodes: + selected = rows[:base_limit] + trace_rows = _mark_selected(rows, selected, decision="base_rank") + diagnostics = { + "enabled": self.enabled, + "applied": False, + "decision_node": decision_node, + "decision": "node_not_enabled" if self.enabled else "disabled", + "apply_nodes": sorted(self.apply_nodes), + "raw_candidate_count": len(rows), + "selected_count": len(selected), + "catalog": self.load_report, + } + return selected, trace_rows, diagnostics + + domain_entries = self.catalog.get(str(domain).lower(), []) + query_annotation = _annotate_text( + domain_entries, + query, + trigger_field="query_triggers", + subject_type="query", + ) + scored = [] + candidates = [] + for index, row in enumerate(rows): + memory_text = "\n".join( + [ + str(row.get("uri") or ""), + str(row.get("_text") or ""), + str(row.get("level") or ""), + ] + ) + memory_annotation = _annotate_text( + domain_entries, + memory_text, + trigger_field="memory_triggers", + subject_type="memory", + ) + score, reasons, match_flags = _candidate_score( + query_annotation, + memory_annotation, + original_rank=index + 1, + original_score=_score_value(row.get("score")) * self.search_score_weight, + ) + candidate = { + "uri": row.get("uri"), + "raw_rank": index + 1, + "raw_score": row.get("score"), + "category_score": score, + "category_rerank_reasons": reasons, + "memory_category": memory_annotation, + **match_flags, + } + candidates.append(candidate) + scored.append((score, -index, row, candidate)) + + sorted_scored = sorted(scored, key=lambda item: (item[0], item[1]), reverse=True) + positive_level = "none" + if any(item[3].get("category2_match") for item in sorted_scored): + positive_level = "category2" + elif any(item[3].get("category1_match") for item in sorted_scored): + positive_level = "category1" + + decision = "soft_reranked" + filtered = sorted_scored + if self.positive_match_required: + if positive_level == "category2": + filtered = [item for item in sorted_scored if item[3].get("category2_match")] + decision = "positive_category2_match" + elif positive_level == "category1": + filtered = [item for item in sorted_scored if item[3].get("category1_match")] + decision = "positive_category1_match" + elif self.no_match_policy == "skip_injection": + filtered = [] + decision = "no_positive_category_match_skip_injection" + + inject_limit = self.inject_limit or base_limit + selected = [item[2] for item in filtered[:inject_limit]] + trace_rows = _mark_selected( + rows, + selected, + decision=decision, + kept_before_cap=[item[2] for item in filtered], + candidate_by_uri={ + str(candidate.get("uri") or ""): candidate for candidate in candidates + }, + ) + diagnostics = { + "enabled": True, + "applied": True, + "decision_node": decision_node, + "decision": decision, + "raw_candidate_count": len(rows), + "selected_count": len(selected), + "retrieve_limit": self.retrieve_limit, + "inject_limit": inject_limit, + "positive_match_required": self.positive_match_required, + "positive_match_level": positive_level, + "no_match_policy": self.no_match_policy, + "query_category": query_annotation, + "candidate_count": len(candidates), + "candidates": candidates, + "catalog": self.load_report, + } + return selected, trace_rows, diagnostics + + +def _load_catalog(raw_path: Any, *, repo_root: Path) -> tuple[dict[str, list[CategoryEntry]], dict[str, Any]]: + report = { + "path": None, + "loaded": False, + "domain_count": 0, + "category_count": 0, + "errors": [], + } + if not raw_path: + report["errors"].append("missing_catalog_path") + return {}, report + path = Path(str(raw_path)).expanduser() + if not path.is_absolute(): + path = repo_root / path + report["path"] = str(path) + if not path.is_file(): + report["errors"].append("catalog_file_not_found") + return {}, report + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: + report["errors"].append(f"{type(exc).__name__}: {exc}") + return {}, report + + domains = payload.get("domains") if isinstance(payload, dict) else {} + catalog: dict[str, list[CategoryEntry]] = {} + if not isinstance(domains, dict): + report["errors"].append("catalog_domains_must_be_mapping") + return {}, report + for domain, domain_payload in domains.items(): + raw_categories = domain_payload + if isinstance(domain_payload, dict): + raw_categories = domain_payload.get("categories") or [] + if not isinstance(raw_categories, list): + continue + entries = [] + for row in raw_categories: + if not isinstance(row, dict): + continue + entry = CategoryEntry.from_payload(row) + if entry: + entries.append(entry) + if entries: + catalog[str(domain).lower()] = entries + report["loaded"] = bool(catalog) + report["domain_count"] = len(catalog) + report["category_count"] = sum(len(entries) for entries in catalog.values()) + return catalog, report + + +def _annotate_text( + entries: list[CategoryEntry], + text: str, + *, + trigger_field: str, + subject_type: str, +) -> dict[str, Any]: + normalized = _normalize(text) + matches = [] + for entry in entries: + triggers = getattr(entry, trigger_field) + matched = [trigger for trigger in triggers if _normalize(trigger) in normalized] + negative = [ + trigger for trigger in entry.negative_triggers if _normalize(trigger) in normalized + ] + if not matched: + continue + score = len(matched) - (0.25 * len(negative)) + matches.append( + { + "category_id": entry.category_id, + "category1": entry.category1, + "category2": entry.category2, + "matched_triggers": matched[:8], + "negative_triggers": negative[:8], + "score": score, + } + ) + matches.sort(key=lambda item: item["score"], reverse=True) + category1 = list(dict.fromkeys(row["category1"] for row in matches if row["category1"])) + category2 = list(dict.fromkeys(row["category2"] for row in matches if row["category2"])) + primary = matches[0] if matches else None + return { + "subject_type": subject_type, + "category_source": "tau2_category_catalog_keyword_match", + "matched": bool(matches), + "primary_category_id": primary.get("category_id") if primary else None, + "category1": category1, + "category2": category2, + "confidence": min(1.0, 0.45 + 0.1 * len(matches)) if matches else 0.0, + "matches": matches[:5], + } + + +def _values(payload: dict[str, Any], key: str) -> set[str]: + value = payload.get(key) + if isinstance(value, str) and value.strip(): + return {value.strip()} + if isinstance(value, list): + return {str(item).strip() for item in value if str(item).strip()} + return set() + + +def _candidate_score( + query: dict[str, Any], + memory: dict[str, Any], + *, + original_rank: int, + original_score: float, +) -> tuple[float, list[str], dict[str, bool]]: + score = original_score - (original_rank * 0.001) + reasons = ["openviking_score", "raw_rank_tiebreak"] + query_c1 = _values(query, "category1") + query_c2 = _values(query, "category2") + memory_c1 = _values(memory, "category1") + memory_c2 = _values(memory, "category2") + category2_match = bool(query_c2 and memory_c2 and query_c2 & memory_c2) + category1_match = bool(query_c1 and memory_c1 and query_c1 & memory_c1) + if category2_match: + score += 100.0 + reasons.append("category2_match") + if category1_match: + score += 40.0 + reasons.append("category1_match") + if query_c2 and memory_c2 and not category2_match: + score -= 5.0 + reasons.append("category2_mismatch_downrank") + if query_c1 and memory_c1 and not category1_match: + score -= 20.0 + reasons.append("category1_mismatch_downrank") + if (query_c1 or query_c2) and not (memory_c1 or memory_c2): + score -= 2.0 + reasons.append("missing_memory_category") + return score, reasons, { + "category1_match": category1_match, + "category2_match": category2_match, + "category_explicit_mismatch": bool( + (query_c1 and memory_c1 and not category1_match) + or (query_c2 and memory_c2 and not category2_match) + ), + } + + +def _row_key(row: dict[str, Any]) -> str: + return str(row.get("uri") or row.get("memory_id") or id(row)) + + +def _mark_selected( + rows: list[dict[str, Any]], + selected_rows: list[dict[str, Any]], + *, + decision: str, + kept_before_cap: list[dict[str, Any]] | None = None, + candidate_by_uri: dict[str, dict[str, Any]] | None = None, +) -> list[dict[str, Any]]: + selected_keys = {_row_key(row) for row in selected_rows} + kept_keys = {_row_key(row) for row in (kept_before_cap or selected_rows)} + trace_rows = [] + for index, row in enumerate(rows, start=1): + key = _row_key(row) + traced = _public_row(row) + traced["raw_rank"] = index + traced["selected_for_injection"] = key in selected_keys + traced["injected"] = bool(traced["selected_for_injection"] and int(row.get("text_chars") or 0) > 0) + if not traced["selected_for_injection"]: + traced["skipped_reason"] = ( + "category_rerank_inject_limit" if key in kept_keys else "category_rerank" + ) + if decision == "no_positive_category_match_skip_injection": + traced["skipped_reason"] = "category_rerank_no_positive_match" + candidate = (candidate_by_uri or {}).get(str(row.get("uri") or "")) + if candidate: + traced["category_rerank_score"] = candidate.get("category_score") + traced["category_rerank_reasons"] = candidate.get("category_rerank_reasons") + traced["memory_category"] = candidate.get("memory_category") + traced["category1_match"] = candidate.get("category1_match") + traced["category2_match"] = candidate.get("category2_match") + traced["category_explicit_mismatch"] = candidate.get("category_explicit_mismatch") + trace_rows.append(traced) + return trace_rows diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 7a38a34d41..ff4d4bf1d7 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -26,6 +26,9 @@ ) +REPO_ROOT = Path(__file__).resolve().parents[3] + + def _reward(sim: dict[str, Any]) -> float: info = sim.get("reward_info") or {} value = info.get("reward", sim.get("reward", 0.0)) @@ -98,6 +101,8 @@ def _tau2_command( f"Unsupported search_memory_type for {strategy['id']}: {search_memory_type}" ) search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" + category_rerank = strategy.get("category_rerank") + category_rerank = category_rerank if isinstance(category_rerank, dict) else {} command = [ sys.executable, str(Path(__file__).with_name("run_memory_v2_eval.py")), @@ -146,6 +151,13 @@ def _tau2_command( "--seed", str(seed), ] + if category_rerank.get("enabled"): + command.extend( + [ + "--category-rerank-config", + json.dumps(category_rerank, ensure_ascii=False, sort_keys=True), + ] + ) if task_ids: for task_id in task_ids: command.extend(["--task-id", task_id]) @@ -265,6 +277,7 @@ def _build_plan( "corpus_id": strategy.get("corpus_id", strategy["id"]), "retrieval_mode": strategy.get("retrieval_mode"), "search_memory_type": strategy.get("search_memory_type", "experiences"), + "category_rerank": strategy.get("category_rerank") or {"enabled": False}, "adapter_status": strategy.get("adapter_status", "ready"), "executable": command is not None, "user_simulator_policy": user_simulator_policy(config), @@ -452,6 +465,31 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: if strict and not ok: errors.append(f"missing Python module: {module}") + category_rows = [] + for strategy in config.get("strategies") or []: + category_rerank = strategy.get("category_rerank") + if not isinstance(category_rerank, dict) or not category_rerank.get("enabled"): + continue + raw_catalog_path = category_rerank.get("catalog_path") + catalog_path = Path(str(raw_catalog_path or "")).expanduser() + if raw_catalog_path and not catalog_path.is_absolute(): + catalog_path = REPO_ROOT / catalog_path + exists = bool(raw_catalog_path and catalog_path.is_file()) + category_rows.append( + { + "strategy_id": strategy.get("id"), + "catalog_path": str(catalog_path) if raw_catalog_path else None, + "exists": exists, + "apply_nodes": category_rerank.get("apply_nodes"), + "retrieve_limit": category_rerank.get("retrieve_limit"), + "inject_limit": category_rerank.get("inject_limit"), + } + ) + if strict and not exists: + errors.append( + f"missing category rerank catalog for {strategy.get('id')}: {raw_catalog_path}" + ) + report = { "status": "failed" if errors else "ok", "strict": strict, @@ -461,6 +499,7 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: "domains": domains(config), "strategies": strategy_ids(config), "imports": import_rows, + "category_rerank_catalogs": category_rows, "split_files": split_rows, "errors": errors, } diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 64384f4d60..b5859db546 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -9,6 +9,7 @@ from pathlib import Path from typing import Any +from category_rerank import CategoryReranker from tau2_common import normalize_litellm_env AGENT_NAME = "openviking_memory_agent" @@ -374,16 +375,24 @@ def get_init_state(self, message_history=None): ) return state - def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: + def _retrieve( + self, + query: str, + *, + decision_node: str, + ) -> tuple[str, list[dict[str, Any]], dict[str, Any]]: client = _client(args) rows: list[dict[str, Any]] = [] try: + search_limit = args.category_reranker.search_limit( + args.retrieval_top_k, + decision_node=decision_node, + ) result = client.search( - query=query, target_uri=args.search_uri, limit=args.retrieval_top_k + query=query, target_uri=args.search_uri, limit=search_limit ) memories = list(getattr(result, "memories", []) or []) - blocks = [] - for index, match in enumerate(memories[: args.retrieval_top_k], 1): + for index, match in enumerate(memories[:search_limit], 1): uri = getattr(match, "uri", "") text, read_error = _read_memory_text(client, match) row = { @@ -391,13 +400,24 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: "score": getattr(match, "score", None), "level": getattr(match, "level", None), "text_chars": len(text), + "_text": text, } if read_error: row["read_error"] = read_error rows.append(row) + selected_rows, trace_rows, category_rerank = args.category_reranker.select( + domain=args.domain, + query=query, + rows=rows, + decision_node=decision_node, + base_limit=args.retrieval_top_k, + ) + blocks = [] + for index, row in enumerate(selected_rows, 1): + text = str(row.get("_text") or "") if text.strip(): - blocks.append(f"Memory {index} ({uri}):\n{text.strip()}") - return "\n\n".join(blocks), rows + blocks.append(f"Memory {index} ({row.get('uri', '')}):\n{text.strip()}") + return "\n\n".join(blocks), trace_rows, category_rerank finally: client.close() @@ -407,7 +427,15 @@ def _trace(self, event: dict[str, Any]) -> None: @staticmethod def _trace_injection_fields(block: str, matches: list[dict[str, Any]]) -> dict[str, Any]: - injected_count = sum(1 for row in matches if int(row.get("text_chars") or 0) > 0) + injected_count = sum( + 1 + for row in matches + if row.get("injected") + or ( + row.get("selected_for_injection", True) + and int(row.get("text_chars") or 0) > 0 + ) + ) return { "injected": bool(block.strip()), "injected_count": injected_count if block.strip() else 0, @@ -494,7 +522,7 @@ def generate_next_message(self, message, state: LLMAgentState): role_value = getattr(role, "value", role) if marker_index is not None and str(role_value) == "user": query = str(getattr(message, "content", "") or "") - block, matches = self._retrieve(query) + block, matches, category_rerank = self._retrieve(query, decision_node="first_user") prompt = ( "No OpenViking memory matched this user request." if not block @@ -508,6 +536,7 @@ def generate_next_message(self, message, state: LLMAgentState): "query": query, "match_count": len(matches), "matches": matches, + "category_rerank": category_rerank, **self._trace_injection_fields(block, matches), } ) @@ -518,13 +547,17 @@ def generate_next_message(self, message, state: LLMAgentState): write_calls = [call for call in tool_calls if _is_write_tool_call(call)] if write_calls: query = _tool_call_query(write_calls, state.messages) - block, matches = self._retrieve(query) + block, matches, category_rerank = self._retrieve( + query, + decision_node="before_write_tool_call", + ) self._trace( { "decision_node": "before_write_tool_call", "query": query, "match_count": len(matches), "matches": matches, + "category_rerank": category_rerank, **self._trace_injection_fields(block, matches), "tool_calls": [ { @@ -600,9 +633,14 @@ def main() -> int: choices=["first_user", "prewrite", "first_user_prewrite"], default="first_user", ) + parser.add_argument("--category-rerank-config", type=_json, default={}) parser.add_argument("--force-train", action="store_true") args = parser.parse_args() normalize_litellm_env() + args.category_reranker = CategoryReranker.from_payload( + args.category_rerank_config, + repo_root=REPO_ROOT, + ) args.tau2_repo = args.tau2_repo.resolve() args.run_dir.mkdir(parents=True, exist_ok=True) @@ -642,6 +680,7 @@ def main() -> int: "retrieval_mode": args.retrieval_mode, "seed": args.seed, "corpus": corpus, + "category_rerank": args.category_reranker.summary(), "eval_results": str(eval_results), "retrieval_trace": str(trace_path), "metrics": _metrics(eval_results), From fddd7bab2665e9180a8d3a4a9326beb1e374a0b4 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 19:26:30 +0800 Subject: [PATCH 04/42] refine trajectory memory view prompt --- openviking/prompts/templates/memory/trajectories.yaml | 9 ++++++--- .../session/memory/agent_trajectory_context_provider.py | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/openviking/prompts/templates/memory/trajectories.yaml b/openviking/prompts/templates/memory/trajectories.yaml index ebd52c1069..05889157ae 100644 --- a/openviking/prompts/templates/memory/trajectories.yaml +++ b/openviking/prompts/templates/memory/trajectories.yaml @@ -34,6 +34,8 @@ fields: description: | Procedure-like trajectory view in EXACTLY this format: + # + - Domain: - Trigger: - Preconditions: 1. @@ -48,19 +50,20 @@ fields: - - - Result: - - Evidence: + - Evidence: Rules: - Write for future agent execution, not for human audit. Prefer clear instructions over chronological narration. - Preserve the successful or best-known path: critical reads, required policy checks, confirmation steps, write-tool ordering, and final user-facing completion. - Keep negative lessons in Anti-patterns, not mixed into Procedure. - - Keep the memory grounded in this session, but abstract away user-specific names, raw IDs, exact payloads, and long tool responses unless they are essential to the reusable procedure. + - Keep the memory grounded in this session, but abstract away user-specific names, raw IDs, exact payloads, and long tool responses. + - Do not include raw user/order/reservation/payment/card IDs, user names, email/phone/address values, exact dates, case-specific amounts, card suffixes, flight numbers, order numbers, or raw tool payloads. Replace them with semantic descriptions such as "a delivered order", "an ineligible basic-economy reservation", "the saved payment method", or "a policy-ineligible cancellation". Stable policy constants may be kept when they are needed for future execution. - Mention tool names when they are part of the reusable path, but summarize observations instead of copying raw JSON. - If the session failed or was partial, still write the best reusable lesson: put the corrected approach in Procedure and the failure cause in Anti-patterns / Evidence. - Avoid broad SOPs. The Trigger and Applicability Boundary should make this record narrower than a whole domain workflow. General Rules: - - Use exactly the 7 labels above in this exact order. + - Use exactly the title, Domain line, and 7 labels above in this exact order. - Trigger, Result, and Evidence are ONE sentence each. - No extra headings, free paragraphs, or closing remarks. merge_op: patch diff --git a/openviking/session/memory/agent_trajectory_context_provider.py b/openviking/session/memory/agent_trajectory_context_provider.py index 6b1280b1d0..935ccbd4d3 100644 --- a/openviking/session/memory/agent_trajectory_context_provider.py +++ b/openviking/session/memory/agent_trajectory_context_provider.py @@ -33,6 +33,8 @@ def instruction(self) -> str: Write the record as a compact procedure-like view of the useful execution pattern, not as a raw transcript. Keep the future agent's decision points, tool path, confirmation/write boundary, failure corrections, and applicability boundary. +Generalize case evidence; do not copy raw user names, identifiers, dates, amounts, +payment details, or tool payloads into the reusable memory. Sub-tasks, pivots, errors, and follow-ups are folded into that one record as steps, guardrails, or evidence — not separate trajectories. From 7cd7acd96efdb5a96f6bc84906315182f3019af1 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 21:18:16 +0800 Subject: [PATCH 05/42] test(benchmark): cover tau2 category rerank helper --- tests/benchmark/test_tau2_category_rerank.py | 77 ++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/benchmark/test_tau2_category_rerank.py diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py new file mode 100644 index 0000000000..02ceb8aff0 --- /dev/null +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -0,0 +1,77 @@ +from pathlib import Path + +from benchmark.tau2.scripts.category_rerank import CategoryReranker + + +def _reranker() -> CategoryReranker: + return CategoryReranker.from_payload( + { + "enabled": True, + "catalog_path": "benchmark/tau2/config/category_catalog.json", + "apply_nodes": ["before_write_tool_call"], + "retrieve_limit": 6, + "inject_limit": 4, + "positive_match_required": True, + "no_match_policy": "skip_injection", + }, + repo_root=Path(__file__).resolve().parents[2], + ) + + +def test_category_rerank_keeps_positive_category_match() -> None: + rows = [ + { + "uri": "viking://agent/demo/memories/trajectories/delivered_exchange.md", + "score": 0.25, + "_text": "Use exchange_delivered_order_items for a delivered order exchange replacement.", + }, + { + "uri": "viking://agent/demo/memories/trajectories/pending_cancel.md", + "score": 0.99, + "_text": "Use cancel_pending_order for a pending order cancellation.", + }, + ] + + selected, trace_rows, diagnostics = _reranker().select( + domain="retail", + query="I need to exchange_delivered_order_items for a delivered order replacement.", + rows=rows, + decision_node="before_write_tool_call", + base_limit=4, + ) + + assert diagnostics["applied"] is True + assert diagnostics["decision"] == "positive_category2_match" + assert diagnostics["positive_match_level"] == "category2" + assert diagnostics["query_category"]["primary_category_id"] == ( + "retail_order_post_shipment_service_request:delivered_order_exchange" + ) + assert [row["uri"] for row in selected] == [ + "viking://agent/demo/memories/trajectories/delivered_exchange.md" + ] + assert trace_rows[0]["selected_for_injection"] is True + assert trace_rows[1]["selected_for_injection"] is False + assert trace_rows[1]["skipped_reason"] == "category_rerank" + + +def test_category_rerank_skips_non_target_node() -> None: + rows = [ + {"uri": "viking://agent/demo/memories/trajectories/one.md", "score": 0.2}, + {"uri": "viking://agent/demo/memories/trajectories/two.md", "score": 0.1}, + ] + + selected, trace_rows, diagnostics = _reranker().select( + domain="retail", + query="exchange_delivered_order_items", + rows=rows, + decision_node="first_user", + base_limit=1, + ) + + assert diagnostics["applied"] is False + assert diagnostics["decision"] == "node_not_enabled" + assert [row["uri"] for row in selected] == [ + "viking://agent/demo/memories/trajectories/one.md" + ] + assert trace_rows[0]["selected_for_injection"] is True + assert trace_rows[1]["selected_for_injection"] is False From 04960009620c03a915471fe5929e0e8deceeb7b4 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 21:30:52 +0800 Subject: [PATCH 06/42] feat(benchmark): prepare tau2 memory corpora before eval --- benchmark/tau2/README.md | 5 +- benchmark/tau2/config/baseline.yaml | 1 + benchmark/tau2/scripts/run_eval.py | 66 ++++++++++++++++++++ benchmark/tau2/scripts/run_memory_v2_eval.py | 17 +++++ 4 files changed, 88 insertions(+), 1 deletion(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index f53f7c99f1..6c542d9bcc 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -135,7 +135,10 @@ Memory V2 cells run through a small TAU-2 agent adapter in this directory: The existing `train_memory_mode: experience_only` value selects the Memory V2 session-commit path. `search_memory_type` selects which generated memory bucket is retrieved during eval (`experiences` by default, `trajectories` for -`config/trajectory.yaml`). +`config/trajectory.yaml`). The runner prepares each distinct +`domain + corpus_id` once before executing eval cells. Different corpora may be +prepared in parallel with `benchmark.corpus_prepare_concurrency`; session +commits inside one corpus remain serial to preserve OpenViking write semantics. ## User Simulator Policy diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 4c4a5060e7..95541dcbd6 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -7,6 +7,7 @@ benchmark: eval_split_name: test repeat_count: 8 task_max_concurrency: 10 + corpus_prepare_concurrency: 2 max_steps: 200 seed: 300 agent: llm_agent diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 7a38a34d41..4cdf9abc8e 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -6,6 +6,7 @@ import json import subprocess import sys +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Any @@ -286,6 +287,9 @@ def _build_plan( "cell_count": len(cells), "executable_cell_count": executable_cell_count, "pending_cell_count": len(cells) - executable_cell_count, + "corpus_prepare_concurrency": int( + config["benchmark"].get("corpus_prepare_concurrency", 1) + ), "cells": cells, } @@ -318,6 +322,67 @@ def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, return _metrics_from_tau2_results(results_path) +def _memory_corpus_key(cell: dict[str, Any]) -> str: + corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) + return f"{cell['domain']}_{corpus_id}" + + +def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, Any]: + key = _memory_corpus_key(cell) + command = list(cell["command"]) + ["--prepare-corpus-only"] + print(f"[tau2] preparing corpus {key}", flush=True) + completed = subprocess.run( + command, + cwd=repo, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) + row = { + "domain": cell["domain"], + "strategy_id": cell["strategy_id"], + "corpus_id": corpus_id, + "returncode": completed.returncode, + "stdout_tail": completed.stdout[-4000:], + "stderr_tail": completed.stderr[-4000:], + "artifacts": { + "corpus_manifest": str( + out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" / "corpus_manifest.json" + ) + }, + } + write_json(out / "corpus_prepare_results" / f"{key}.json", row) + if completed.returncode != 0: + raise RuntimeError(f"corpus prepare failed: {key} returncode={completed.returncode}") + return row + + +def _prepare_memory_corpora(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: + corpus_cells: dict[str, dict[str, Any]] = {} + for cell in plan["cells"]: + if cell.get("memory_backend") != "openviking" or not cell.get("train_required"): + continue + corpus_cells.setdefault(_memory_corpus_key(cell), cell) + if not corpus_cells: + return [] + + worker_count = max(1, int(plan.get("corpus_prepare_concurrency") or 1)) + if worker_count == 1 or len(corpus_cells) == 1: + return [_prepare_memory_corpus(cell, repo, out) for cell in corpus_cells.values()] + + rows: list[dict[str, Any]] = [] + with ThreadPoolExecutor(max_workers=worker_count) as executor: + futures = { + executor.submit(_prepare_memory_corpus, cell, repo, out): key + for key, cell in corpus_cells.items() + } + for future in as_completed(futures): + rows.append(future.result()) + return rows + + def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: metric_rows = [row for row in rows_for_group if row.get("metrics")] @@ -382,6 +447,7 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str "configured user simulator policy is not supported by this TAU-2 checkout: " f"{policy_report}" ) + _prepare_memory_corpora(plan, repo, out) rows = [] for cell in plan["cells"]: if not cell.get("executable"): diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 64384f4d60..67e68d2b44 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -601,6 +601,7 @@ def main() -> int: default="first_user", ) parser.add_argument("--force-train", action="store_true") + parser.add_argument("--prepare-corpus-only", action="store_true") args = parser.parse_args() normalize_litellm_env() @@ -615,6 +616,22 @@ def main() -> int: summary_path = args.run_dir / f"{args.run_label}.summary.json" corpus = _train(args, train_results, corpus_manifest) + if args.prepare_corpus_only: + print( + json.dumps( + { + "run_label": args.run_label, + "domain": args.domain, + "strategy_id": args.strategy_id, + "prepare_corpus_only": True, + "corpus": corpus, + }, + ensure_ascii=False, + sort_keys=True, + ) + ) + return 0 + trace_path.touch() _register_memory_agent(args, trace_path) _run_tau2( From 8e3dc6075bf0df5cdd89178bea0245534c6eb40c Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 21:40:28 +0800 Subject: [PATCH 07/42] align tau2 category rerank with harness baseline --- benchmark/tau2/README.md | 8 +- benchmark/tau2/config/category_rerank.yaml | 5 +- benchmark/tau2/scripts/category_rerank.py | 81 ++++++++++++++++++-- tests/benchmark/test_tau2_category_rerank.py | 14 +++- 4 files changed, 94 insertions(+), 14 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 0aac24e2aa..47f6326678 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -156,9 +156,11 @@ is retrieved during eval (`experiences` by default, `trajectories` for an adapter-local category rerank only at `before_write_tool_call`. The reranker loads `config/category_catalog.json`, annotates the runtime query and candidate memories from visible text/tool names/URIs, retrieves a wider candidate pool, -then injects only the top category-aligned memories. Retrieval traces include -the query category, candidate memory categories, rerank reasons, selected rows, -and skipped rows. +then follows the Agent Harness S83/S84 positive-match baseline: retrieve 6, +keep same-category candidates, inject at most 2, and skip injection when no +positive category match exists. Retrieval traces include the query category, +candidate memory categories, rerank reasons, selected rows, skipped rows, and +the flat `*_category*_prompt` fields consumed by Harness diagnostics. ## User Simulator Policy diff --git a/benchmark/tau2/config/category_rerank.yaml b/benchmark/tau2/config/category_rerank.yaml index 4028148014..28804d41bd 100644 --- a/benchmark/tau2/config/category_rerank.yaml +++ b/benchmark/tau2/config/category_rerank.yaml @@ -26,7 +26,8 @@ strategies: apply_nodes: - before_write_tool_call retrieve_limit: 6 - inject_limit: 4 + inject_limit: 2 + mismatch_policy: keep_positive_match_drop_mismatch positive_match_required: true no_match_policy: skip_injection - search_score_weight: 1.0 + search_score_weight: 0.0 diff --git a/benchmark/tau2/scripts/category_rerank.py b/benchmark/tau2/scripts/category_rerank.py index daa61de9c6..6045b4c696 100644 --- a/benchmark/tau2/scripts/category_rerank.py +++ b/benchmark/tau2/scripts/category_rerank.py @@ -99,6 +99,7 @@ def __init__( load_report: dict[str, Any], retrieve_limit: int | None, inject_limit: int | None, + mismatch_policy: str, positive_match_required: bool, no_match_policy: str, search_score_weight: float, @@ -109,6 +110,7 @@ def __init__( self.load_report = load_report self.retrieve_limit = retrieve_limit self.inject_limit = inject_limit + self.mismatch_policy = mismatch_policy self.positive_match_required = positive_match_required self.no_match_policy = no_match_policy self.search_score_weight = search_score_weight @@ -131,6 +133,13 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca "category_count": 0, "errors": [], } + mismatch_policy = str(payload.get("mismatch_policy") or "").strip() + positive_match_required = _as_bool( + payload.get("positive_match_required"), + default=mismatch_policy in {"keep_positive_match_drop_mismatch", "positive_match_only"}, + ) + if not mismatch_policy and positive_match_required: + mismatch_policy = "keep_positive_match_drop_mismatch" return cls( enabled=enabled, apply_nodes=apply_nodes, @@ -138,9 +147,10 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca load_report=load_report, retrieve_limit=_as_int(payload.get("retrieve_limit"), 0) or None, inject_limit=_as_int(payload.get("inject_limit"), 0) or None, - positive_match_required=_as_bool(payload.get("positive_match_required"), default=True), + mismatch_policy=mismatch_policy or "none", + positive_match_required=positive_match_required, no_match_policy=str(payload.get("no_match_policy") or "skip_injection"), - search_score_weight=float(payload.get("search_score_weight") or 1.0), + search_score_weight=float(payload.get("search_score_weight") or 0.0), ) def search_limit(self, base_limit: int, *, decision_node: str) -> int: @@ -154,8 +164,10 @@ def summary(self) -> dict[str, Any]: "apply_nodes": sorted(self.apply_nodes), "retrieve_limit": self.retrieve_limit, "inject_limit": self.inject_limit, + "mismatch_policy": self.mismatch_policy, "positive_match_required": self.positive_match_required, "no_match_policy": self.no_match_policy, + "search_score_weight": self.search_score_weight, "catalog": self.load_report, } @@ -179,7 +191,15 @@ def select( "apply_nodes": sorted(self.apply_nodes), "raw_candidate_count": len(rows), "selected_count": len(selected), + "retrieve_limit": self.retrieve_limit, + "inject_limit": self.inject_limit or base_limit, + "mismatch_policy": self.mismatch_policy, + "no_match_policy": self.no_match_policy, + "positive_match_required": self.positive_match_required, + "selection_policy": "score_sort", "catalog": self.load_report, + "loaded_files": _loaded_files(self.load_report), + "load_errors": self.load_report.get("errors") or [], } return selected, trace_rows, diagnostics @@ -233,16 +253,36 @@ def select( decision = "soft_reranked" filtered = sorted_scored - if self.positive_match_required: + dropped_mismatch_count = 0 + if self.mismatch_policy in {"keep_positive_match_drop_mismatch", "positive_match_only"}: + before_count = len(sorted_scored) if positive_level == "category2": filtered = [item for item in sorted_scored if item[3].get("category2_match")] - decision = "positive_category2_match" + decision = "soft_reranked_keep_category2_matches" elif positive_level == "category1": filtered = [item for item in sorted_scored if item[3].get("category1_match")] - decision = "positive_category1_match" + decision = "soft_reranked_keep_category1_matches" + elif self.no_match_policy == "skip_injection": + filtered = [] + decision = "no_positive_category_match_skip_injection" + dropped_mismatch_count = before_count - len(filtered) + elif self.mismatch_policy == "drop_when_match_available": + has_positive_match = positive_level in {"category1", "category2"} + if has_positive_match: + before_count = len(sorted_scored) + guarded = [item for item in sorted_scored if not item[3].get("category_explicit_mismatch")] + if guarded: + filtered = guarded + decision = "soft_reranked_with_mismatch_guard" + dropped_mismatch_count = before_count - len(filtered) elif self.no_match_policy == "skip_injection": + dropped_mismatch_count = len(sorted_scored) filtered = [] decision = "no_positive_category_match_skip_injection" + elif self.no_match_policy == "skip_injection" and positive_level == "none": + dropped_mismatch_count = len(sorted_scored) + filtered = [] + decision = "no_positive_category_match_skip_injection" inject_limit = self.inject_limit or base_limit selected = [item[2] for item in filtered[:inject_limit]] @@ -254,6 +294,7 @@ def select( candidate_by_uri={ str(candidate.get("uri") or ""): candidate for candidate in candidates }, + query_category=query_annotation, ) diagnostics = { "enabled": True, @@ -264,17 +305,29 @@ def select( "selected_count": len(selected), "retrieve_limit": self.retrieve_limit, "inject_limit": inject_limit, + "mismatch_policy": self.mismatch_policy, "positive_match_required": self.positive_match_required, "positive_match_level": positive_level, "no_match_policy": self.no_match_policy, + "selection_policy": "score_sort", + "dropped_mismatch_count": dropped_mismatch_count, + "kept_before_cap_ids": [str(item[2].get("uri") or "") for item in filtered], "query_category": query_annotation, "candidate_count": len(candidates), "candidates": candidates, "catalog": self.load_report, + "loaded_files": _loaded_files(self.load_report), + "load_errors": self.load_report.get("errors") or [], } return selected, trace_rows, diagnostics +def _loaded_files(load_report: dict[str, Any]) -> list[str]: + if load_report.get("loaded") and load_report.get("path"): + return [str(load_report["path"])] + return [] + + def _load_catalog(raw_path: Any, *, repo_root: Path) -> tuple[dict[str, list[CategoryEntry]], dict[str, Any]]: report = { "path": None, @@ -386,7 +439,9 @@ def _candidate_score( original_score: float, ) -> tuple[float, list[str], dict[str, bool]]: score = original_score - (original_rank * 0.001) - reasons = ["openviking_score", "raw_rank_tiebreak"] + reasons = ["original_rank_tiebreak"] + if original_score: + reasons.insert(0, "openviking_score") query_c1 = _values(query, "category1") query_c2 = _values(query, "category2") memory_c1 = _values(memory, "category1") @@ -429,6 +484,7 @@ def _mark_selected( decision: str, kept_before_cap: list[dict[str, Any]] | None = None, candidate_by_uri: dict[str, dict[str, Any]] | None = None, + query_category: dict[str, Any] | None = None, ) -> list[dict[str, Any]]: selected_keys = {_row_key(row) for row in selected_rows} kept_keys = {_row_key(row) for row in (kept_before_cap or selected_rows)} @@ -447,9 +503,20 @@ def _mark_selected( traced["skipped_reason"] = "category_rerank_no_positive_match" candidate = (candidate_by_uri or {}).get(str(row.get("uri") or "")) if candidate: + memory_category = candidate.get("memory_category") + memory_category = memory_category if isinstance(memory_category, dict) else {} traced["category_rerank_score"] = candidate.get("category_score") traced["category_rerank_reasons"] = candidate.get("category_rerank_reasons") - traced["memory_category"] = candidate.get("memory_category") + traced["memory_category"] = memory_category + traced["memory_category1_prompt"] = memory_category.get("category1") + traced["memory_category2_prompt"] = memory_category.get("category2") + traced["memory_category_source_prompt"] = memory_category.get("category_source") + traced["memory_category_confidence_prompt"] = memory_category.get("confidence") + if query_category: + traced["query_category1_prompt"] = query_category.get("category1") + traced["query_category2_prompt"] = query_category.get("category2") + traced["query_category_source_prompt"] = query_category.get("category_source") + traced["query_category_confidence_prompt"] = query_category.get("confidence") traced["category1_match"] = candidate.get("category1_match") traced["category2_match"] = candidate.get("category2_match") traced["category_explicit_mismatch"] = candidate.get("category_explicit_mismatch") diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 02ceb8aff0..d933e4c9d1 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -10,9 +10,11 @@ def _reranker() -> CategoryReranker: "catalog_path": "benchmark/tau2/config/category_catalog.json", "apply_nodes": ["before_write_tool_call"], "retrieve_limit": 6, - "inject_limit": 4, + "inject_limit": 2, + "mismatch_policy": "keep_positive_match_drop_mismatch", "positive_match_required": True, "no_match_policy": "skip_injection", + "search_score_weight": 0.0, }, repo_root=Path(__file__).resolve().parents[2], ) @@ -41,8 +43,10 @@ def test_category_rerank_keeps_positive_category_match() -> None: ) assert diagnostics["applied"] is True - assert diagnostics["decision"] == "positive_category2_match" + assert diagnostics["decision"] == "soft_reranked_keep_category2_matches" + assert diagnostics["mismatch_policy"] == "keep_positive_match_drop_mismatch" assert diagnostics["positive_match_level"] == "category2" + assert diagnostics["inject_limit"] == 2 assert diagnostics["query_category"]["primary_category_id"] == ( "retail_order_post_shipment_service_request:delivered_order_exchange" ) @@ -50,6 +54,12 @@ def test_category_rerank_keeps_positive_category_match() -> None: "viking://agent/demo/memories/trajectories/delivered_exchange.md" ] assert trace_rows[0]["selected_for_injection"] is True + assert trace_rows[0]["query_category1_prompt"] == [ + "retail_order_post_shipment_service_request" + ] + assert trace_rows[0]["memory_category1_prompt"] == [ + "retail_order_post_shipment_service_request" + ] assert trace_rows[1]["selected_for_injection"] is False assert trace_rows[1]["skipped_reason"] == "category_rerank" From f7b3815ebd46c13597487f48ea926c48893b5b76 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 22:10:34 +0800 Subject: [PATCH 08/42] bench(tau2): align category rerank with FGMemory route --- benchmark/tau2/README.md | 26 ++--- benchmark/tau2/config/category_rerank.yaml | 9 +- .../retail_same_order_variant_guard.md | 9 ++ benchmark/tau2/scripts/run_eval.py | 45 ++++++++ benchmark/tau2/scripts/run_memory_v2_eval.py | 102 +++++++++++++++++- tests/benchmark/test_tau2_category_rerank.py | 32 ++++++ 6 files changed, 207 insertions(+), 16 deletions(-) create mode 100644 benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 47f6326678..ea27e8871a 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -6,10 +6,10 @@ evaluation. The scope is intentionally narrow: - fresh OpenViking Memory V2 experience-only baseline; - Memory V2 pre-write recall treatment. - trajectory-view retrieval treatment for the refined trajectory prompt. -- experimental category-reranked pre-write recall on top of trajectory-view memory. +- experimental FGMemory-style pre-write recall on top of trajectory-view memory. -Category rerank is opt-in and experimental; it is meant for PR-C review and -smoke/targeted probes before any productization decision. +The FGMemory-style route is opt-in and experimental; it is meant for PR-C +review and smoke/targeted probes before any productization decision. ## Layout @@ -93,7 +93,7 @@ benchmark/tau2/run_full_eval.sh \ --repeat-count 1 ``` -Plan a one-cell trajectory category-rerank smoke: +Plan a one-cell trajectory FGMemory-style category-rerank smoke: ```bash benchmark/tau2/run_full_eval.sh \ @@ -153,14 +153,16 @@ is retrieved during eval (`experiences` by default, `trajectories` for `config/trajectory.yaml`). `config/category_rerank.yaml` keeps the PR-B trajectory memory route and enables -an adapter-local category rerank only at `before_write_tool_call`. The reranker -loads `config/category_catalog.json`, annotates the runtime query and candidate -memories from visible text/tool names/URIs, retrieves a wider candidate pool, -then follows the Agent Harness S83/S84 positive-match baseline: retrieve 6, -keep same-category candidates, inject at most 2, and skip injection when no -positive category match exists. Retrieval traces include the query category, -candidate memory categories, rerank reasons, selected rows, skipped rows, and -the flat `*_category*_prompt` fields consumed by Harness diagnostics. +an adapter-local FGMemory-style probe: pre-write recall, annotation category +rerank, and the retail scope prompt used by the Harness High-TrajView/FGMemory +route. The category sub-policy follows the S84 component settings, but the +alignment target is the red-box S89/FGMemory high result: retrieve 6, keep +same-category candidates, inject at most 2, skip injection when no positive +category match exists, and apply the scope/applicability prompt at the system +prompt injection point. Retrieval traces include the query category, candidate +memory categories, rerank reasons, selected rows, skipped rows, scope prompt +metadata, and the flat `*_category*_prompt` fields consumed by Harness +diagnostics. ## User Simulator Policy diff --git a/benchmark/tau2/config/category_rerank.yaml b/benchmark/tau2/config/category_rerank.yaml index 28804d41bd..86cde2aff4 100644 --- a/benchmark/tau2/config/category_rerank.yaml +++ b/benchmark/tau2/config/category_rerank.yaml @@ -1,7 +1,7 @@ extends: trajectory.yaml benchmark: - name: tau2_openviking_trajectory_category_rerank + name: tau2_openviking_fgmemory_category_rerank strategies: - id: memory_v2_trajectory_prewrite @@ -13,7 +13,7 @@ strategies: search_memory_type: trajectories retrieval_mode: first_user_prewrite - id: memory_v2_trajectory_category_prewrite - label: OpenViking Memory V2 trajectory-view category-reranked pre-write recall + label: OpenViking Memory V2 trajectory-view FGMemory-style pre-write recall memory_backend: openviking train_required: true corpus_id: memory_v2_trajectory_view @@ -31,3 +31,8 @@ strategies: positive_match_required: true no_match_policy: skip_injection search_score_weight: 0.0 + scope_prompt: + enabled: true + injection_point: system_prompt + domain_files: + retail: benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md diff --git a/benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md b/benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md new file mode 100644 index 0000000000..2884ba0abb --- /dev/null +++ b/benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md @@ -0,0 +1,9 @@ + +Retail exchange and modification memories are advisory. Do not broaden the user's requested replacement scope. + +- If the user says the replacement should come from the same order, the rest of that order, or an item already in that order, choose only among items visible in the current order details. +- In that case, do not call product-catalog variant lookup to find a cheaper or more available variant unless the user explicitly asks for the cheapest available variant of the product. +- If a procedure memory says to fetch all product variants but the user's wording restricts the candidate set to observed order items, follow the user's narrower scope. +- Before write tools, the new item id must be grounded in the current order observations or in the user's explicit requested catalog variant. +- Do not treat "user provided the order id" in a memory as mandatory. If the user has authenticated but does not know the order id, use current tools to retrieve the user's order list and inspect likely orders instead of stopping or repeatedly asking for the order id. + diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index ff4d4bf1d7..1d4c348607 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -103,6 +103,8 @@ def _tau2_command( search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" category_rerank = strategy.get("category_rerank") category_rerank = category_rerank if isinstance(category_rerank, dict) else {} + scope_prompt = strategy.get("scope_prompt") + scope_prompt = scope_prompt if isinstance(scope_prompt, dict) else {} command = [ sys.executable, str(Path(__file__).with_name("run_memory_v2_eval.py")), @@ -158,6 +160,13 @@ def _tau2_command( json.dumps(category_rerank, ensure_ascii=False, sort_keys=True), ] ) + if scope_prompt.get("enabled"): + command.extend( + [ + "--scope-prompt-config", + json.dumps(scope_prompt, ensure_ascii=False, sort_keys=True), + ] + ) if task_ids: for task_id in task_ids: command.extend(["--task-id", task_id]) @@ -278,6 +287,7 @@ def _build_plan( "retrieval_mode": strategy.get("retrieval_mode"), "search_memory_type": strategy.get("search_memory_type", "experiences"), "category_rerank": strategy.get("category_rerank") or {"enabled": False}, + "scope_prompt": strategy.get("scope_prompt") or {"enabled": False}, "adapter_status": strategy.get("adapter_status", "ready"), "executable": command is not None, "user_simulator_policy": user_simulator_policy(config), @@ -490,6 +500,40 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: f"missing category rerank catalog for {strategy.get('id')}: {raw_catalog_path}" ) + scope_prompt_rows = [] + for strategy in config.get("strategies") or []: + scope_prompt = strategy.get("scope_prompt") + if not isinstance(scope_prompt, dict) or not scope_prompt.get("enabled"): + continue + domain_files = scope_prompt.get("domain_files") + domain_files = domain_files if isinstance(domain_files, dict) else {} + domain_texts = scope_prompt.get("domain_texts") + domain_texts = domain_texts if isinstance(domain_texts, dict) else {} + for domain in domains(config): + raw_prompt_path = domain_files.get(domain) + prompt_path = None + exists = False + if raw_prompt_path: + prompt_path = Path(str(raw_prompt_path)).expanduser() + if not prompt_path.is_absolute(): + prompt_path = REPO_ROOT / prompt_path + exists = prompt_path.is_file() + if strict and not exists: + errors.append( + f"missing scope prompt file for {strategy.get('id')} {domain}: " + f"{raw_prompt_path}" + ) + scope_prompt_rows.append( + { + "strategy_id": strategy.get("id"), + "domain": domain, + "configured": bool(raw_prompt_path or domain_texts.get(domain)), + "prompt_path": str(prompt_path) if prompt_path else None, + "exists": exists, + "injection_point": scope_prompt.get("injection_point", "system_prompt"), + } + ) + report = { "status": "failed" if errors else "ok", "strict": strict, @@ -500,6 +544,7 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: "strategies": strategy_ids(config), "imports": import_rows, "category_rerank_catalogs": category_rows, + "scope_prompts": scope_prompt_rows, "split_files": split_rows, "errors": errors, } diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index b5859db546..f872441ed3 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -9,8 +9,15 @@ from pathlib import Path from typing import Any -from category_rerank import CategoryReranker -from tau2_common import normalize_litellm_env +try: + from category_rerank import CategoryReranker +except ModuleNotFoundError: # pragma: no cover - package import path + from .category_rerank import CategoryReranker + +try: + from tau2_common import normalize_litellm_env +except ModuleNotFoundError: # pragma: no cover - package import path + from .tau2_common import normalize_litellm_env AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] @@ -35,6 +42,14 @@ def _json(text: str) -> dict[str, Any]: return json.loads(text) if text else {} +def _as_bool(value: Any, default: bool = False) -> bool: + if value is None: + return default + if isinstance(value, bool): + return value + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + def _write_json(path: Path, payload: Any) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n") @@ -57,6 +72,70 @@ def _compat_results_path(path: Path) -> Path: return run_dir / "results.json" +def _resolve_repo_path(raw_path: Any, *, repo_root: Path) -> Path: + path = Path(str(raw_path)).expanduser() + if not path.is_absolute(): + path = repo_root / path + return path + + +def _domain_value(mapping: Any, domain: str) -> Any: + if isinstance(mapping, dict): + return mapping.get(domain) or mapping.get(str(domain).lower()) + return None + + +def _load_scope_prompt( + payload: dict[str, Any] | None, + *, + domain: str, + repo_root: Path, +) -> tuple[str, dict[str, Any]]: + payload = payload if isinstance(payload, dict) else {} + enabled = _as_bool(payload.get("enabled"), default=False) + summary: dict[str, Any] = { + "enabled": enabled, + "domain": domain, + "injection_point": str(payload.get("injection_point") or "system_prompt"), + "loaded": False, + "loaded_files": [], + "text_chars": 0, + } + if not enabled: + summary["skipped_reason"] = "disabled" + return "", summary + + text = str(_domain_value(payload.get("domain_texts"), domain) or "").strip() + raw_path = _domain_value(payload.get("domain_files"), domain) + if raw_path: + path = _resolve_repo_path(raw_path, repo_root=repo_root) + summary["loaded_files"] = [str(path)] + if not path.is_file(): + raise FileNotFoundError(f"scope prompt file not found for {domain}: {path}") + text = path.read_text(encoding="utf-8").strip() + + if not text: + summary["skipped_reason"] = "no_domain_scope_prompt" + return "", summary + + summary["loaded"] = True + summary["text_chars"] = len(text) + return text, summary + + +def _scope_prompt_text(prompt: str) -> str: + if not prompt.strip(): + return "" + return ( + "Use this OpenViking memory applicability guard together with retrieved " + "memories. Current tool observations and the current user request remain " + "authoritative.\n\n" + "\n" + f"{prompt.strip()}\n" + "" + ) + + def _reward(sim: dict[str, Any]) -> float: info = sim.get("reward_info") or {} value = info.get("reward", sim.get("reward", 0.0)) @@ -369,6 +448,18 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None: class OpenVikingMemoryAgent(LLMAgent): def get_init_state(self, message_history=None): state = super().get_init_state(message_history) + scope_prompt = _scope_prompt_text(args.scope_prompt_text) + if scope_prompt: + state.system_messages.append(SystemMessage(role="system", content=scope_prompt)) + self._trace( + { + "decision_node": "static_scope_prompt", + "retrieval_action_taken": "scope_prompt_static_injection", + "scope_prompt": args.scope_prompt_summary, + "injected": True, + "injected_count": 1, + } + ) if args.retrieval_mode in {"first_user", "first_user_prewrite"}: state.system_messages.append( SystemMessage(role="system", content="") @@ -634,6 +725,7 @@ def main() -> int: default="first_user", ) parser.add_argument("--category-rerank-config", type=_json, default={}) + parser.add_argument("--scope-prompt-config", type=_json, default={}) parser.add_argument("--force-train", action="store_true") args = parser.parse_args() normalize_litellm_env() @@ -641,6 +733,11 @@ def main() -> int: args.category_rerank_config, repo_root=REPO_ROOT, ) + args.scope_prompt_text, args.scope_prompt_summary = _load_scope_prompt( + args.scope_prompt_config, + domain=args.domain, + repo_root=REPO_ROOT, + ) args.tau2_repo = args.tau2_repo.resolve() args.run_dir.mkdir(parents=True, exist_ok=True) @@ -681,6 +778,7 @@ def main() -> int: "seed": args.seed, "corpus": corpus, "category_rerank": args.category_reranker.summary(), + "scope_prompt": args.scope_prompt_summary, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), "metrics": _metrics(eval_results), diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index d933e4c9d1..14d8693ead 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -1,6 +1,7 @@ from pathlib import Path from benchmark.tau2.scripts.category_rerank import CategoryReranker +from benchmark.tau2.scripts.run_memory_v2_eval import _load_scope_prompt def _reranker() -> CategoryReranker: @@ -85,3 +86,34 @@ def test_category_rerank_skips_non_target_node() -> None: ] assert trace_rows[0]["selected_for_injection"] is True assert trace_rows[1]["selected_for_injection"] is False + + +def test_scope_prompt_loads_domain_file(tmp_path: Path) -> None: + prompt = tmp_path / "retail_scope.md" + prompt.write_text("same order") + + text, summary = _load_scope_prompt( + {"enabled": True, "domain_files": {"retail": str(prompt)}}, + domain="retail", + repo_root=Path(__file__).resolve().parents[2], + ) + + assert "same order" in text + assert summary["enabled"] is True + assert summary["loaded"] is True + assert summary["loaded_files"] == [str(prompt)] + + +def test_scope_prompt_skips_unconfigured_domain(tmp_path: Path) -> None: + prompt = tmp_path / "retail_scope.md" + prompt.write_text("retail only") + + text, summary = _load_scope_prompt( + {"enabled": True, "domain_files": {"retail": str(prompt)}}, + domain="airline", + repo_root=Path(__file__).resolve().parents[2], + ) + + assert text == "" + assert summary["loaded"] is False + assert summary["skipped_reason"] == "no_domain_scope_prompt" From 08d33a933644e7cadbebe0662d064b54967db393 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 22:17:39 +0800 Subject: [PATCH 09/42] fix(benchmark): tighten trajectory evidence prompt --- benchmark/tau2/scripts/run_eval.py | 4 +--- openviking/prompts/templates/memory/trajectories.yaml | 3 ++- .../session/memory/agent_trajectory_context_provider.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 4cdf9abc8e..f2c250a247 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -287,9 +287,7 @@ def _build_plan( "cell_count": len(cells), "executable_cell_count": executable_cell_count, "pending_cell_count": len(cells) - executable_cell_count, - "corpus_prepare_concurrency": int( - config["benchmark"].get("corpus_prepare_concurrency", 1) - ), + "corpus_prepare_concurrency": int(config["benchmark"].get("corpus_prepare_concurrency", 1)), "cells": cells, } diff --git a/openviking/prompts/templates/memory/trajectories.yaml b/openviking/prompts/templates/memory/trajectories.yaml index 05889157ae..a16b7485f9 100644 --- a/openviking/prompts/templates/memory/trajectories.yaml +++ b/openviking/prompts/templates/memory/trajectories.yaml @@ -57,7 +57,8 @@ fields: - Preserve the successful or best-known path: critical reads, required policy checks, confirmation steps, write-tool ordering, and final user-facing completion. - Keep negative lessons in Anti-patterns, not mixed into Procedure. - Keep the memory grounded in this session, but abstract away user-specific names, raw IDs, exact payloads, and long tool responses. - - Do not include raw user/order/reservation/payment/card IDs, user names, email/phone/address values, exact dates, case-specific amounts, card suffixes, flight numbers, order numbers, or raw tool payloads. Replace them with semantic descriptions such as "a delivered order", "an ineligible basic-economy reservation", "the saved payment method", or "a policy-ineligible cancellation". Stable policy constants may be kept when they are needed for future execution. + - Do not include raw user/order/reservation/payment/card IDs, user names, email/phone/address values, exact dates, case-specific amounts, exact budgets, route pairs, airport pairs, passenger/bag counts, card suffixes, flight numbers, order numbers, or raw tool payloads. Replace them with semantic descriptions such as "a delivered order", "an ineligible basic-economy reservation", "the saved payment method", or "a policy-ineligible cancellation". Stable policy constants may be kept when they are needed for future execution. + - The Evidence line is not an exception: it may name the source status and lesson type, but must not carry case-specific values such as exact amounts, routes, dates, counts, product names, or customer-specific state. - Mention tool names when they are part of the reusable path, but summarize observations instead of copying raw JSON. - If the session failed or was partial, still write the best reusable lesson: put the corrected approach in Procedure and the failure cause in Anti-patterns / Evidence. - Avoid broad SOPs. The Trigger and Applicability Boundary should make this record narrower than a whole domain workflow. diff --git a/openviking/session/memory/agent_trajectory_context_provider.py b/openviking/session/memory/agent_trajectory_context_provider.py index 935ccbd4d3..1f9f3d25c0 100644 --- a/openviking/session/memory/agent_trajectory_context_provider.py +++ b/openviking/session/memory/agent_trajectory_context_provider.py @@ -34,7 +34,7 @@ def instruction(self) -> str: not as a raw transcript. Keep the future agent's decision points, tool path, confirmation/write boundary, failure corrections, and applicability boundary. Generalize case evidence; do not copy raw user names, identifiers, dates, amounts, -payment details, or tool payloads into the reusable memory. +exact budgets, routes, counts, payment details, or tool payloads into the reusable memory. Sub-tasks, pivots, errors, and follow-ups are folded into that one record as steps, guardrails, or evidence — not separate trajectories. From 1bdc6a939eace190f835d226cd2bd985258e00fb Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 23:25:58 +0800 Subject: [PATCH 10/42] bench(tau2): resolve memory eval artifact paths --- benchmark/tau2/scripts/run_memory_v2_eval.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index f08a478715..ff60079f2b 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -740,7 +740,10 @@ def main() -> int: repo_root=REPO_ROOT, ) - args.tau2_repo = args.tau2_repo.resolve() + args.tau2_repo = args.tau2_repo.expanduser().resolve() + args.run_dir = args.run_dir.expanduser().resolve() + if args.corpus_dir: + args.corpus_dir = args.corpus_dir.expanduser().resolve() args.run_dir.mkdir(parents=True, exist_ok=True) corpus_dir = args.corpus_dir or args.run_dir corpus_dir.mkdir(parents=True, exist_ok=True) From 9cfe362721cead6f5eaac7e0b6d5a3ada6580682 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 23:50:30 +0800 Subject: [PATCH 11/42] fix(benchmark): guard tau2 infrastructure failures --- benchmark/tau2/scripts/run_eval.py | 2 + benchmark/tau2/scripts/run_memory_v2_eval.py | 30 ++++++++++++- benchmark/tau2/scripts/tau2_common.py | 47 ++++++++++++++++---- 3 files changed, 69 insertions(+), 10 deletions(-) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index f2c250a247..2fec4cbf0e 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -11,6 +11,7 @@ from typing import Any from tau2_common import ( + assert_tau2_results_complete, domains, load_config, normalize_litellm_env, @@ -49,6 +50,7 @@ def _db_match(sim: dict[str, Any]) -> bool | None: def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: data = json.loads(results_path.read_text(encoding="utf-8")) + assert_tau2_results_complete(data, context=str(results_path)) sims = data.get("simulations") or [] rewards = [_reward(sim) for sim in sims] db_values = [_db_match(sim) for sim in sims] diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 67e68d2b44..bc1fd026af 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -2,14 +2,16 @@ from __future__ import annotations import argparse +import importlib import json import shutil import sys import time +from copy import deepcopy from pathlib import Path from typing import Any -from tau2_common import normalize_litellm_env +from tau2_common import assert_tau2_results_complete, normalize_litellm_env AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] @@ -45,6 +47,27 @@ def _add_tau2_to_path(tau2_repo: Path) -> None: sys.path.insert(0, str(src if src.is_dir() else tau2_repo)) +def _patch_tau2_auxiliary_llm_defaults(llm: str, llm_args: dict[str, Any]) -> None: + # TAU-2 exposes agent/user LLMs in TextRunConfig, but NL assertion scoring + # still reads module defaults. Keep the evaluator on the same configured + # model so benchmark runs do not fall back to inaccessible upstream defaults. + patches = { + "DEFAULT_LLM_NL_ASSERTIONS": llm, + "DEFAULT_LLM_NL_ASSERTIONS_ARGS": deepcopy(llm_args), + "DEFAULT_LLM_ENV_INTERFACE": llm, + "DEFAULT_LLM_ENV_INTERFACE_ARGS": deepcopy(llm_args), + } + for module_name in ( + "tau2.config", + "tau2.evaluator.evaluator_nl_assertions", + "tau2.environment.utils.interface_agent", + ): + module = importlib.import_module(module_name) + for name, value in patches.items(): + if hasattr(module, name): + setattr(module, name, deepcopy(value)) + + def _save_to_arg(path: Path) -> str: # Some TAU-2 versions append ".json"; newer versions treat save_to as a # run directory and write results.json under it. @@ -174,6 +197,7 @@ def _run_tau2( save_to: Path, ): _add_tau2_to_path(tau2_repo) + _patch_tau2_auxiliary_llm_defaults(agent_llm, agent_llm_args) from tau2.data_model.simulation import RunConfig, TextRunConfig from tau2.run import run_domain @@ -300,6 +324,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) ) data = json.loads(train_results.read_text()) + assert_tau2_results_complete(data, context=f"{args.domain} train") client = _client(args) committed = [] try: @@ -652,6 +677,9 @@ def main() -> int: seed=args.seed, save_to=eval_results, ) + assert_tau2_results_complete( + json.loads(eval_results.read_text()), context=f"{args.domain} eval" + ) summary = { "run_label": args.run_label, "domain": args.domain, diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index a8b5ce2013..7a1dfd1d93 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -11,7 +11,6 @@ import yaml - TAU2_DIR = Path(__file__).resolve().parents[1] REPO_ROOT = TAU2_DIR.parents[1] CONFIRMATION_AWARE_UPSTREAM_PR = "https://github.com/sierra-research/tau2-bench/pull/297" @@ -63,6 +62,7 @@ def normalize_litellm_env() -> dict[str, Any]: def render_env(value: Any) -> Any: if isinstance(value, str): + def replace(match: re.Match[str]) -> str: name = match.group(1) default = match.group(2) or "" @@ -79,11 +79,7 @@ def replace(match: re.Match[str]) -> str: def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: merged = dict(base) for key, value in override.items(): - if ( - key in merged - and isinstance(merged[key], dict) - and isinstance(value, dict) - ): + if key in merged and isinstance(merged[key], dict) and isinstance(value, dict): merged[key] = deep_merge(merged[key], value) else: merged[key] = value @@ -125,6 +121,41 @@ def write_json(path: Path, payload: Any) -> None: ) +def tau2_result_failures(data: dict[str, Any], *, expected_trials: int = 1) -> list[str]: + tasks = data.get("tasks") or [] + simulations = data.get("simulations") or [] + failures: list[str] = [] + if tasks: + expected = len(tasks) * expected_trials + if len(simulations) != expected: + failures.append(f"expected {expected} simulations, found {len(simulations)}") + + for sim in simulations: + info = sim.get("info") or {} + termination_reason = str(sim.get("termination_reason") or "") + if info.get("failed_after_attempts") or "infrastructure_error" in termination_reason: + failures.append( + "task=" + f"{sim.get('task_id')} trial={sim.get('trial', 0)} " + f"termination={termination_reason} error={info.get('error') or info.get('error_type')}" + ) + elif not sim.get("messages"): + failures.append( + f"task={sim.get('task_id')} trial={sim.get('trial', 0)} has no messages" + ) + return failures + + +def assert_tau2_results_complete( + data: dict[str, Any], *, context: str, expected_trials: int = 1 +) -> None: + failures = tau2_result_failures(data, expected_trials=expected_trials) + if failures: + preview = "; ".join(failures[:5]) + more = f"; ... {len(failures) - 5} more" if len(failures) > 5 else "" + raise RuntimeError(f"{context} produced invalid TAU-2 results: {preview}{more}") + + def strategy_ids(config: dict[str, Any]) -> list[str]: strategies = config.get("strategies") or [] if not isinstance(strategies, list): @@ -219,9 +250,7 @@ def user_simulator_policy(config: dict[str, Any]) -> str: policy = config.get("eval", {}).get("user_simulator_policy", "official") policy = str(policy) if policy not in {"official", "confirmation_aware"}: - raise ValueError( - "eval.user_simulator_policy must be 'official' or 'confirmation_aware'" - ) + raise ValueError("eval.user_simulator_policy must be 'official' or 'confirmation_aware'") return policy From dc12e32eb40978ed780ec18416474f79af46a7dc Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 00:31:49 +0800 Subject: [PATCH 12/42] bench(tau2): support category annotation sidecars --- benchmark/tau2/README.md | 11 +- benchmark/tau2/config/category_rerank.yaml | 3 + benchmark/tau2/scripts/category_rerank.py | 290 ++++++++++++++++++- benchmark/tau2/scripts/run_eval.py | 48 ++- tests/benchmark/test_tau2_category_rerank.py | 84 ++++++ 5 files changed, 415 insertions(+), 21 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 1aa7fafe80..1eed4a66e3 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -156,9 +156,14 @@ prepared in parallel with `benchmark.corpus_prepare_concurrency`; session commits inside one corpus remain serial to preserve OpenViking write semantics. `config/category_rerank.yaml` keeps the PR-B trajectory memory route and enables -an adapter-local FGMemory-style probe: pre-write recall, annotation category -rerank, and the retail scope prompt used by the Harness High-TrajView/FGMemory -route. The category sub-policy follows the S84 component settings, but the +an adapter-local FGMemory-style probe: pre-write recall, category rerank with +Harness `memory_category_annotation.v0` sidecar support, and the retail scope +prompt used by the Harness High-TrajView/FGMemory route. When sidecar files are +provided through `OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES` or +`AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES`, the reranker uses those +query/memory annotations first; otherwise it falls back to the local category +catalog keyword classifier for smoke runs. The category sub-policy follows the +S84 component settings, but the alignment target is the red-box S89/FGMemory high result: retrieve 6, keep same-category candidates, inject at most 2, skip injection when no positive category match exists, and apply the scope/applicability prompt at the system diff --git a/benchmark/tau2/config/category_rerank.yaml b/benchmark/tau2/config/category_rerank.yaml index 86cde2aff4..ed918a04e3 100644 --- a/benchmark/tau2/config/category_rerank.yaml +++ b/benchmark/tau2/config/category_rerank.yaml @@ -23,6 +23,9 @@ strategies: category_rerank: enabled: true catalog_path: benchmark/tau2/config/category_catalog.json + annotation_files_env: + - OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES + - AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES apply_nodes: - before_write_tool_call retrieve_limit: 6 diff --git a/benchmark/tau2/scripts/category_rerank.py b/benchmark/tau2/scripts/category_rerank.py index 6045b4c696..da6eac438e 100644 --- a/benchmark/tau2/scripts/category_rerank.py +++ b/benchmark/tau2/scripts/category_rerank.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import os import re from dataclasses import dataclass from pathlib import Path @@ -97,6 +98,7 @@ def __init__( apply_nodes: set[str], catalog: dict[str, list[CategoryEntry]], load_report: dict[str, Any], + annotation_index: dict[str, Any], retrieve_limit: int | None, inject_limit: int | None, mismatch_policy: str, @@ -108,6 +110,7 @@ def __init__( self.apply_nodes = apply_nodes self.catalog = catalog self.load_report = load_report + self.annotation_index = annotation_index self.retrieve_limit = retrieve_limit self.inject_limit = inject_limit self.mismatch_policy = mismatch_policy @@ -124,6 +127,7 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca catalog, load_report = _load_catalog(payload.get("catalog_path"), repo_root=repo_root) if not load_report.get("loaded"): raise ValueError(f"category rerank catalog failed to load: {load_report}") + annotation_index = _load_annotation_index(payload, repo_root=repo_root) else: catalog = {} load_report = { @@ -133,6 +137,7 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca "category_count": 0, "errors": [], } + annotation_index = _empty_annotation_index() mismatch_policy = str(payload.get("mismatch_policy") or "").strip() positive_match_required = _as_bool( payload.get("positive_match_required"), @@ -145,6 +150,7 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca apply_nodes=apply_nodes, catalog=catalog, load_report=load_report, + annotation_index=annotation_index, retrieve_limit=_as_int(payload.get("retrieve_limit"), 0) or None, inject_limit=_as_int(payload.get("inject_limit"), 0) or None, mismatch_policy=mismatch_policy or "none", @@ -169,6 +175,7 @@ def summary(self) -> dict[str, Any]: "no_match_policy": self.no_match_policy, "search_score_weight": self.search_score_weight, "catalog": self.load_report, + "annotation_sidecar": _annotation_summary(self.annotation_index), } def select( @@ -198,17 +205,18 @@ def select( "positive_match_required": self.positive_match_required, "selection_policy": "score_sort", "catalog": self.load_report, - "loaded_files": _loaded_files(self.load_report), - "load_errors": self.load_report.get("errors") or [], + "annotation_sidecar": _annotation_summary(self.annotation_index), + "loaded_files": _loaded_files(self.load_report, self.annotation_index), + "load_errors": _load_errors(self.load_report, self.annotation_index), } return selected, trace_rows, diagnostics domain_entries = self.catalog.get(str(domain).lower(), []) - query_annotation = _annotate_text( + query_annotation = _query_annotation( + self.annotation_index, domain_entries, - query, - trigger_field="query_triggers", - subject_type="query", + domain=domain, + query=query, ) scored = [] candidates = [] @@ -220,11 +228,11 @@ def select( str(row.get("level") or ""), ] ) - memory_annotation = _annotate_text( + memory_annotation = _memory_annotation( + self.annotation_index, domain_entries, - memory_text, - trigger_field="memory_triggers", - subject_type="memory", + row=row, + text=memory_text, ) score, reasons, match_flags = _candidate_score( query_annotation, @@ -316,16 +324,171 @@ def select( "candidate_count": len(candidates), "candidates": candidates, "catalog": self.load_report, - "loaded_files": _loaded_files(self.load_report), - "load_errors": self.load_report.get("errors") or [], + "annotation_sidecar": _annotation_summary(self.annotation_index), + "loaded_files": _loaded_files(self.load_report, self.annotation_index), + "load_errors": _load_errors(self.load_report, self.annotation_index), } return selected, trace_rows, diagnostics -def _loaded_files(load_report: dict[str, Any]) -> list[str]: +def _loaded_files( + load_report: dict[str, Any], + annotation_index: dict[str, Any] | None = None, +) -> list[str]: + files: list[str] = [] if load_report.get("loaded") and load_report.get("path"): - return [str(load_report["path"])] - return [] + files.append(str(load_report["path"])) + if isinstance(annotation_index, dict): + files.extend(str(row.get("path")) for row in annotation_index.get("loaded_files") or []) + return files + + +def _load_errors( + load_report: dict[str, Any], + annotation_index: dict[str, Any] | None = None, +) -> list[Any]: + errors = list(load_report.get("errors") or []) + if isinstance(annotation_index, dict): + errors.extend(annotation_index.get("load_errors") or []) + return errors + + +def _empty_annotation_index() -> dict[str, Any]: + return { + "by_key": {}, + "loaded_files": [], + "load_errors": [], + "row_count": 0, + "enabled": False, + } + + +def _annotation_summary(index: dict[str, Any]) -> dict[str, Any]: + return { + "enabled": bool(index.get("enabled")), + "loaded": bool(index.get("loaded_files")), + "loaded_files": index.get("loaded_files") or [], + "row_count": index.get("row_count") or 0, + "key_count": len(index.get("by_key") or {}), + "load_errors": index.get("load_errors") or [], + } + + +def _resolve_path(raw_path: Any, *, repo_root: Path) -> Path: + path = Path(str(raw_path)).expanduser() + if not path.is_absolute(): + path = repo_root / path + return path + + +def _split_path_text(value: Any) -> list[str]: + if isinstance(value, list): + return [str(item).strip() for item in value if str(item).strip()] + if isinstance(value, dict): + return [item for raw in value.values() for item in _split_path_text(raw)] + text = str(value or "").strip() + if not text: + return [] + return [part.strip() for part in re.split(r"[:\n,]", text) if part.strip()] + + +def _annotation_file_values(payload: dict[str, Any]) -> list[str]: + values = _split_path_text(payload.get("annotation_files")) + env_names = _as_list(payload.get("annotation_files_env")) or [ + "OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES", + "AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES", + ] + for name in env_names: + values.extend(_split_path_text(os.environ.get(name))) + return list(dict.fromkeys(values)) + + +def _category_payload(annotation: dict[str, Any]) -> dict[str, Any]: + category = annotation.get("category") if isinstance(annotation.get("category"), dict) else {} + ranking = ( + annotation.get("ranking_features") + if isinstance(annotation.get("ranking_features"), dict) + else {} + ) + catalog_match = ( + category.get("catalog_match") if isinstance(category.get("catalog_match"), dict) else {} + ) + payload = { + "subject_type": ( + annotation.get("subject", {}).get("subject_type") + if isinstance(annotation.get("subject"), dict) + else None + ), + "category_source": category.get("category_source") + or ranking.get("category_source") + or "annotation_sidecar", + "matched": True, + "primary_category_id": catalog_match.get("matched_category_id"), + "category1": category.get("category1") or ranking.get("category1"), + "category2": category.get("category2") or ranking.get("category2"), + "confidence": category.get("confidence") or ranking.get("confidence"), + "catalog_match_decision": catalog_match.get("decision"), + "annotation_id": annotation.get("annotation_id") or annotation.get("request_id"), + } + return {key: value for key, value in payload.items() if value not in (None, "", [])} + + +def _slug_identity(value: str) -> str: + cleaned = re.sub(r"[^a-zA-Z0-9_.-]+", "_", value.strip()) + cleaned = re.sub(r"_+", "_", cleaned).strip("_.-") + return cleaned + + +def _annotation_lookup_keys(value: Any) -> list[str]: + text = str(value or "").strip() + if not text: + return [] + slug = _slug_identity(text) + keys = [text, slug] + if text.startswith("viking://"): + keys.extend([f"openviking_memory_{slug}", f"openviking_memory_{text}"]) + return list(dict.fromkeys(key for key in keys if key)) + + +def _annotation_index_put(index: dict[str, Any], key: Any, annotation: dict[str, Any]) -> None: + for candidate in _annotation_lookup_keys(key): + index["by_key"][candidate] = annotation + + +def _load_annotation_index(payload: dict[str, Any], *, repo_root: Path) -> dict[str, Any]: + index = _empty_annotation_index() + index["enabled"] = True + for raw_path in _annotation_file_values(payload): + path = _resolve_path(raw_path, repo_root=repo_root) + if not path.is_file(): + index["load_errors"].append({"path": str(path), "error": "file_not_found"}) + continue + loaded = 0 + with path.open("r", encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + if not line.strip(): + continue + try: + row = json.loads(line) + except json.JSONDecodeError as exc: + index["load_errors"].append( + {"path": str(path), "line": line_number, "error": str(exc)} + ) + continue + if not isinstance(row, dict): + continue + subject = row.get("subject") if isinstance(row.get("subject"), dict) else {} + for key in ( + row.get("annotation_id"), + row.get("request_id"), + subject.get("subject_id"), + subject.get("subject_ref"), + ): + _annotation_index_put(index, key, row) + loaded += 1 + index["row_count"] += loaded + index["loaded_files"].append({"path": str(path), "rows": loaded}) + return index def _load_catalog(raw_path: Any, *, repo_root: Path) -> tuple[dict[str, list[CategoryEntry]], dict[str, Any]]: @@ -422,6 +585,103 @@ def _annotate_text( } +_WRITE_TOOL_PREFIXES = ( + "toggle_", + "enable_", + "disable_", + "set_", + "reset_", + "update_", + "modify_", + "cancel_", + "book_", + "exchange_", + "return_", + "grant_", + "reboot_", +) + + +def _lookup_annotation(index: dict[str, Any], keys: list[str], *, subject_type: str) -> dict[str, Any] | None: + by_key = index.get("by_key") if isinstance(index.get("by_key"), dict) else {} + for key in keys: + for candidate in _annotation_lookup_keys(key): + row = by_key.get(candidate) + if not isinstance(row, dict): + continue + subject = row.get("subject") if isinstance(row.get("subject"), dict) else {} + if subject.get("subject_type") == subject_type: + return row + return None + + +def _query_signature_from_text(domain: str, query: str) -> str | None: + names = [] + for name in re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", query): + if name.startswith(_WRITE_TOOL_PREFIXES): + names.append(name) + if not names: + return None + return "|".join( + [ + "tau2", + str(domain).strip().lower() or "unknown", + "pre_write_action", + "tools=" + ",".join(sorted(set(names))), + ] + ) + + +def _query_annotation( + index: dict[str, Any], + entries: list[CategoryEntry], + *, + domain: str, + query: str, +) -> dict[str, Any]: + signature = _query_signature_from_text(domain, query) + keys = [query] + if signature: + signature_slug = _slug_identity(signature) + keys.extend([signature, signature_slug, f"tau2_query_signature_{signature_slug}"]) + annotation = _lookup_annotation(index, keys, subject_type="query") + if annotation: + payload = _category_payload(annotation) + payload["subject_type"] = "query" + payload["category_source"] = payload.get("category_source") or "annotation_sidecar" + if signature: + payload["query_signature"] = signature + return payload + return _annotate_text( + entries, + query, + trigger_field="query_triggers", + subject_type="query", + ) + + +def _memory_annotation( + index: dict[str, Any], + entries: list[CategoryEntry], + *, + row: dict[str, Any], + text: str, +) -> dict[str, Any]: + keys = [str(row.get("uri") or ""), str(row.get("memory_id") or "")] + annotation = _lookup_annotation(index, keys, subject_type="memory") + if annotation: + payload = _category_payload(annotation) + payload["subject_type"] = "memory" + payload["category_source"] = payload.get("category_source") or "annotation_sidecar" + return payload + return _annotate_text( + entries, + text, + trigger_field="memory_triggers", + subject_type="memory", + ) + + def _values(payload: dict[str, Any], key: str) -> set[str]: value = payload.get(key) if isinstance(value, str) and value.strip(): diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 130c421ed9..4664919a3e 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -4,6 +4,8 @@ import argparse import importlib.util import json +import os +import re import subprocess import sys from concurrent.futures import ThreadPoolExecutor, as_completed @@ -31,6 +33,24 @@ REPO_ROOT = Path(__file__).resolve().parents[3] +def _split_path_text(value: Any) -> list[str]: + if isinstance(value, list): + return [str(item).strip() for item in value if str(item).strip()] + if isinstance(value, dict): + return [item for raw in value.values() for item in _split_path_text(raw)] + text = str(value or "").strip() + if not text: + return [] + return [part.strip() for part in re.split(r"[:\n,]", text) if part.strip()] + + +def _resolve_repo_path(raw_path: str) -> Path: + path = Path(raw_path).expanduser() + if not path.is_absolute(): + path = REPO_ROOT / path + return path + + def _reward(sim: dict[str, Any]) -> float: info = sim.get("reward_info") or {} value = info.get("reward", sim.get("reward", 0.0)) @@ -547,15 +567,37 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: if not isinstance(category_rerank, dict) or not category_rerank.get("enabled"): continue raw_catalog_path = category_rerank.get("catalog_path") - catalog_path = Path(str(raw_catalog_path or "")).expanduser() - if raw_catalog_path and not catalog_path.is_absolute(): - catalog_path = REPO_ROOT / catalog_path + catalog_path = _resolve_repo_path(str(raw_catalog_path or "")) exists = bool(raw_catalog_path and catalog_path.is_file()) + annotation_env_names = _split_path_text(category_rerank.get("annotation_files_env")) or [ + "OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES", + "AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES", + ] + raw_annotation_files = _split_path_text(category_rerank.get("annotation_files")) + for env_name in annotation_env_names: + raw_annotation_files.extend(_split_path_text(os.environ.get(env_name))) + annotation_files = [] + for raw_annotation_file in dict.fromkeys(raw_annotation_files): + annotation_path = _resolve_repo_path(raw_annotation_file) + annotation_exists = annotation_path.is_file() + annotation_files.append( + { + "path": str(annotation_path), + "exists": annotation_exists, + } + ) + if strict and not annotation_exists: + errors.append( + f"missing category annotation sidecar for {strategy.get('id')}: " + f"{raw_annotation_file}" + ) category_rows.append( { "strategy_id": strategy.get("id"), "catalog_path": str(catalog_path) if raw_catalog_path else None, "exists": exists, + "annotation_files_env": annotation_env_names, + "annotation_files": annotation_files, "apply_nodes": category_rerank.get("apply_nodes"), "retrieve_limit": category_rerank.get("retrieve_limit"), "inject_limit": category_rerank.get("inject_limit"), diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 14d8693ead..13a5b3d412 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from benchmark.tau2.scripts.category_rerank import CategoryReranker @@ -88,6 +89,89 @@ def test_category_rerank_skips_non_target_node() -> None: assert trace_rows[1]["selected_for_injection"] is False +def test_category_rerank_prefers_annotation_sidecar(tmp_path: Path) -> None: + sidecar = tmp_path / "annotations.jsonl" + memory_uri = "viking://agent/demo/memories/trajectories/sidecar_exchange.md" + query_subject = ( + "tau2_query_signature_tau2_retail_pre_write_action_tools_exchange_delivered_order_items" + ) + rows = [ + { + "schema_version": "memory_category_annotation.v0", + "annotation_id": f"query:{query_subject}:abc123", + "request_id": f"query:{query_subject}:abc123", + "subject": { + "subject_type": "query", + "subject_id": query_subject, + "domain": "retail", + }, + "category": { + "category1": "sidecar_query_family", + "category2": "sidecar_exact", + "category_source": "existing_catalog", + "confidence": 1.0, + }, + }, + { + "schema_version": "memory_category_annotation.v0", + "annotation_id": "memory:sidecar_exchange:abc123", + "request_id": "memory:sidecar_exchange:abc123", + "subject": { + "subject_type": "memory", + "subject_id": "sidecar_exchange", + "subject_ref": memory_uri, + "domain": "retail", + }, + "category": { + "category1": "sidecar_query_family", + "category2": "sidecar_exact", + "category_source": "llm_prompt", + "confidence": 1.0, + }, + }, + ] + sidecar.write_text("\n".join(json.dumps(row) for row in rows) + "\n") + reranker = CategoryReranker.from_payload( + { + "enabled": True, + "catalog_path": "benchmark/tau2/config/category_catalog.json", + "annotation_files": [str(sidecar)], + "apply_nodes": ["before_write_tool_call"], + "retrieve_limit": 6, + "inject_limit": 1, + "mismatch_policy": "keep_positive_match_drop_mismatch", + "no_match_policy": "skip_injection", + }, + repo_root=Path(__file__).resolve().parents[2], + ) + + selected, trace_rows, diagnostics = reranker.select( + domain="retail", + query="Before executing write-like tool call(s): exchange_delivered_order_items({})", + rows=[ + { + "uri": memory_uri, + "score": 0.1, + "_text": "This text would otherwise look like cancel_pending_order.", + }, + { + "uri": "viking://agent/demo/memories/trajectories/catalog_exchange.md", + "score": 0.9, + "_text": "Use exchange_delivered_order_items for a delivered order exchange.", + }, + ], + decision_node="before_write_tool_call", + base_limit=2, + ) + + assert diagnostics["annotation_sidecar"]["row_count"] == 2 + assert diagnostics["query_category"]["annotation_id"] == f"query:{query_subject}:abc123" + assert [row["uri"] for row in selected] == [memory_uri] + assert trace_rows[0]["memory_category1_prompt"] == "sidecar_query_family" + assert trace_rows[0]["query_category2_prompt"] == "sidecar_exact" + assert trace_rows[1]["selected_for_injection"] is False + + def test_scope_prompt_loads_domain_file(tmp_path: Path) -> None: prompt = tmp_path / "retail_scope.md" prompt.write_text("same order") From 7c88bc4c533113d71f105cb3afba9a1e4ff28a1d Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 00:44:04 +0800 Subject: [PATCH 13/42] Revert "bench(tau2): support category annotation sidecars" This reverts commit dc12e32eb40978ed780ec18416474f79af46a7dc. --- benchmark/tau2/README.md | 11 +- benchmark/tau2/config/category_rerank.yaml | 3 - benchmark/tau2/scripts/category_rerank.py | 290 +------------------ benchmark/tau2/scripts/run_eval.py | 48 +-- tests/benchmark/test_tau2_category_rerank.py | 84 ------ 5 files changed, 21 insertions(+), 415 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 1eed4a66e3..1aa7fafe80 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -156,14 +156,9 @@ prepared in parallel with `benchmark.corpus_prepare_concurrency`; session commits inside one corpus remain serial to preserve OpenViking write semantics. `config/category_rerank.yaml` keeps the PR-B trajectory memory route and enables -an adapter-local FGMemory-style probe: pre-write recall, category rerank with -Harness `memory_category_annotation.v0` sidecar support, and the retail scope -prompt used by the Harness High-TrajView/FGMemory route. When sidecar files are -provided through `OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES` or -`AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES`, the reranker uses those -query/memory annotations first; otherwise it falls back to the local category -catalog keyword classifier for smoke runs. The category sub-policy follows the -S84 component settings, but the +an adapter-local FGMemory-style probe: pre-write recall, annotation category +rerank, and the retail scope prompt used by the Harness High-TrajView/FGMemory +route. The category sub-policy follows the S84 component settings, but the alignment target is the red-box S89/FGMemory high result: retrieve 6, keep same-category candidates, inject at most 2, skip injection when no positive category match exists, and apply the scope/applicability prompt at the system diff --git a/benchmark/tau2/config/category_rerank.yaml b/benchmark/tau2/config/category_rerank.yaml index ed918a04e3..86cde2aff4 100644 --- a/benchmark/tau2/config/category_rerank.yaml +++ b/benchmark/tau2/config/category_rerank.yaml @@ -23,9 +23,6 @@ strategies: category_rerank: enabled: true catalog_path: benchmark/tau2/config/category_catalog.json - annotation_files_env: - - OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES - - AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES apply_nodes: - before_write_tool_call retrieve_limit: 6 diff --git a/benchmark/tau2/scripts/category_rerank.py b/benchmark/tau2/scripts/category_rerank.py index da6eac438e..6045b4c696 100644 --- a/benchmark/tau2/scripts/category_rerank.py +++ b/benchmark/tau2/scripts/category_rerank.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import os import re from dataclasses import dataclass from pathlib import Path @@ -98,7 +97,6 @@ def __init__( apply_nodes: set[str], catalog: dict[str, list[CategoryEntry]], load_report: dict[str, Any], - annotation_index: dict[str, Any], retrieve_limit: int | None, inject_limit: int | None, mismatch_policy: str, @@ -110,7 +108,6 @@ def __init__( self.apply_nodes = apply_nodes self.catalog = catalog self.load_report = load_report - self.annotation_index = annotation_index self.retrieve_limit = retrieve_limit self.inject_limit = inject_limit self.mismatch_policy = mismatch_policy @@ -127,7 +124,6 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca catalog, load_report = _load_catalog(payload.get("catalog_path"), repo_root=repo_root) if not load_report.get("loaded"): raise ValueError(f"category rerank catalog failed to load: {load_report}") - annotation_index = _load_annotation_index(payload, repo_root=repo_root) else: catalog = {} load_report = { @@ -137,7 +133,6 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca "category_count": 0, "errors": [], } - annotation_index = _empty_annotation_index() mismatch_policy = str(payload.get("mismatch_policy") or "").strip() positive_match_required = _as_bool( payload.get("positive_match_required"), @@ -150,7 +145,6 @@ def from_payload(cls, payload: dict[str, Any] | None, *, repo_root: Path) -> "Ca apply_nodes=apply_nodes, catalog=catalog, load_report=load_report, - annotation_index=annotation_index, retrieve_limit=_as_int(payload.get("retrieve_limit"), 0) or None, inject_limit=_as_int(payload.get("inject_limit"), 0) or None, mismatch_policy=mismatch_policy or "none", @@ -175,7 +169,6 @@ def summary(self) -> dict[str, Any]: "no_match_policy": self.no_match_policy, "search_score_weight": self.search_score_weight, "catalog": self.load_report, - "annotation_sidecar": _annotation_summary(self.annotation_index), } def select( @@ -205,18 +198,17 @@ def select( "positive_match_required": self.positive_match_required, "selection_policy": "score_sort", "catalog": self.load_report, - "annotation_sidecar": _annotation_summary(self.annotation_index), - "loaded_files": _loaded_files(self.load_report, self.annotation_index), - "load_errors": _load_errors(self.load_report, self.annotation_index), + "loaded_files": _loaded_files(self.load_report), + "load_errors": self.load_report.get("errors") or [], } return selected, trace_rows, diagnostics domain_entries = self.catalog.get(str(domain).lower(), []) - query_annotation = _query_annotation( - self.annotation_index, + query_annotation = _annotate_text( domain_entries, - domain=domain, - query=query, + query, + trigger_field="query_triggers", + subject_type="query", ) scored = [] candidates = [] @@ -228,11 +220,11 @@ def select( str(row.get("level") or ""), ] ) - memory_annotation = _memory_annotation( - self.annotation_index, + memory_annotation = _annotate_text( domain_entries, - row=row, - text=memory_text, + memory_text, + trigger_field="memory_triggers", + subject_type="memory", ) score, reasons, match_flags = _candidate_score( query_annotation, @@ -324,171 +316,16 @@ def select( "candidate_count": len(candidates), "candidates": candidates, "catalog": self.load_report, - "annotation_sidecar": _annotation_summary(self.annotation_index), - "loaded_files": _loaded_files(self.load_report, self.annotation_index), - "load_errors": _load_errors(self.load_report, self.annotation_index), + "loaded_files": _loaded_files(self.load_report), + "load_errors": self.load_report.get("errors") or [], } return selected, trace_rows, diagnostics -def _loaded_files( - load_report: dict[str, Any], - annotation_index: dict[str, Any] | None = None, -) -> list[str]: - files: list[str] = [] +def _loaded_files(load_report: dict[str, Any]) -> list[str]: if load_report.get("loaded") and load_report.get("path"): - files.append(str(load_report["path"])) - if isinstance(annotation_index, dict): - files.extend(str(row.get("path")) for row in annotation_index.get("loaded_files") or []) - return files - - -def _load_errors( - load_report: dict[str, Any], - annotation_index: dict[str, Any] | None = None, -) -> list[Any]: - errors = list(load_report.get("errors") or []) - if isinstance(annotation_index, dict): - errors.extend(annotation_index.get("load_errors") or []) - return errors - - -def _empty_annotation_index() -> dict[str, Any]: - return { - "by_key": {}, - "loaded_files": [], - "load_errors": [], - "row_count": 0, - "enabled": False, - } - - -def _annotation_summary(index: dict[str, Any]) -> dict[str, Any]: - return { - "enabled": bool(index.get("enabled")), - "loaded": bool(index.get("loaded_files")), - "loaded_files": index.get("loaded_files") or [], - "row_count": index.get("row_count") or 0, - "key_count": len(index.get("by_key") or {}), - "load_errors": index.get("load_errors") or [], - } - - -def _resolve_path(raw_path: Any, *, repo_root: Path) -> Path: - path = Path(str(raw_path)).expanduser() - if not path.is_absolute(): - path = repo_root / path - return path - - -def _split_path_text(value: Any) -> list[str]: - if isinstance(value, list): - return [str(item).strip() for item in value if str(item).strip()] - if isinstance(value, dict): - return [item for raw in value.values() for item in _split_path_text(raw)] - text = str(value or "").strip() - if not text: - return [] - return [part.strip() for part in re.split(r"[:\n,]", text) if part.strip()] - - -def _annotation_file_values(payload: dict[str, Any]) -> list[str]: - values = _split_path_text(payload.get("annotation_files")) - env_names = _as_list(payload.get("annotation_files_env")) or [ - "OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES", - "AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES", - ] - for name in env_names: - values.extend(_split_path_text(os.environ.get(name))) - return list(dict.fromkeys(values)) - - -def _category_payload(annotation: dict[str, Any]) -> dict[str, Any]: - category = annotation.get("category") if isinstance(annotation.get("category"), dict) else {} - ranking = ( - annotation.get("ranking_features") - if isinstance(annotation.get("ranking_features"), dict) - else {} - ) - catalog_match = ( - category.get("catalog_match") if isinstance(category.get("catalog_match"), dict) else {} - ) - payload = { - "subject_type": ( - annotation.get("subject", {}).get("subject_type") - if isinstance(annotation.get("subject"), dict) - else None - ), - "category_source": category.get("category_source") - or ranking.get("category_source") - or "annotation_sidecar", - "matched": True, - "primary_category_id": catalog_match.get("matched_category_id"), - "category1": category.get("category1") or ranking.get("category1"), - "category2": category.get("category2") or ranking.get("category2"), - "confidence": category.get("confidence") or ranking.get("confidence"), - "catalog_match_decision": catalog_match.get("decision"), - "annotation_id": annotation.get("annotation_id") or annotation.get("request_id"), - } - return {key: value for key, value in payload.items() if value not in (None, "", [])} - - -def _slug_identity(value: str) -> str: - cleaned = re.sub(r"[^a-zA-Z0-9_.-]+", "_", value.strip()) - cleaned = re.sub(r"_+", "_", cleaned).strip("_.-") - return cleaned - - -def _annotation_lookup_keys(value: Any) -> list[str]: - text = str(value or "").strip() - if not text: - return [] - slug = _slug_identity(text) - keys = [text, slug] - if text.startswith("viking://"): - keys.extend([f"openviking_memory_{slug}", f"openviking_memory_{text}"]) - return list(dict.fromkeys(key for key in keys if key)) - - -def _annotation_index_put(index: dict[str, Any], key: Any, annotation: dict[str, Any]) -> None: - for candidate in _annotation_lookup_keys(key): - index["by_key"][candidate] = annotation - - -def _load_annotation_index(payload: dict[str, Any], *, repo_root: Path) -> dict[str, Any]: - index = _empty_annotation_index() - index["enabled"] = True - for raw_path in _annotation_file_values(payload): - path = _resolve_path(raw_path, repo_root=repo_root) - if not path.is_file(): - index["load_errors"].append({"path": str(path), "error": "file_not_found"}) - continue - loaded = 0 - with path.open("r", encoding="utf-8") as handle: - for line_number, line in enumerate(handle, start=1): - if not line.strip(): - continue - try: - row = json.loads(line) - except json.JSONDecodeError as exc: - index["load_errors"].append( - {"path": str(path), "line": line_number, "error": str(exc)} - ) - continue - if not isinstance(row, dict): - continue - subject = row.get("subject") if isinstance(row.get("subject"), dict) else {} - for key in ( - row.get("annotation_id"), - row.get("request_id"), - subject.get("subject_id"), - subject.get("subject_ref"), - ): - _annotation_index_put(index, key, row) - loaded += 1 - index["row_count"] += loaded - index["loaded_files"].append({"path": str(path), "rows": loaded}) - return index + return [str(load_report["path"])] + return [] def _load_catalog(raw_path: Any, *, repo_root: Path) -> tuple[dict[str, list[CategoryEntry]], dict[str, Any]]: @@ -585,103 +422,6 @@ def _annotate_text( } -_WRITE_TOOL_PREFIXES = ( - "toggle_", - "enable_", - "disable_", - "set_", - "reset_", - "update_", - "modify_", - "cancel_", - "book_", - "exchange_", - "return_", - "grant_", - "reboot_", -) - - -def _lookup_annotation(index: dict[str, Any], keys: list[str], *, subject_type: str) -> dict[str, Any] | None: - by_key = index.get("by_key") if isinstance(index.get("by_key"), dict) else {} - for key in keys: - for candidate in _annotation_lookup_keys(key): - row = by_key.get(candidate) - if not isinstance(row, dict): - continue - subject = row.get("subject") if isinstance(row.get("subject"), dict) else {} - if subject.get("subject_type") == subject_type: - return row - return None - - -def _query_signature_from_text(domain: str, query: str) -> str | None: - names = [] - for name in re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", query): - if name.startswith(_WRITE_TOOL_PREFIXES): - names.append(name) - if not names: - return None - return "|".join( - [ - "tau2", - str(domain).strip().lower() or "unknown", - "pre_write_action", - "tools=" + ",".join(sorted(set(names))), - ] - ) - - -def _query_annotation( - index: dict[str, Any], - entries: list[CategoryEntry], - *, - domain: str, - query: str, -) -> dict[str, Any]: - signature = _query_signature_from_text(domain, query) - keys = [query] - if signature: - signature_slug = _slug_identity(signature) - keys.extend([signature, signature_slug, f"tau2_query_signature_{signature_slug}"]) - annotation = _lookup_annotation(index, keys, subject_type="query") - if annotation: - payload = _category_payload(annotation) - payload["subject_type"] = "query" - payload["category_source"] = payload.get("category_source") or "annotation_sidecar" - if signature: - payload["query_signature"] = signature - return payload - return _annotate_text( - entries, - query, - trigger_field="query_triggers", - subject_type="query", - ) - - -def _memory_annotation( - index: dict[str, Any], - entries: list[CategoryEntry], - *, - row: dict[str, Any], - text: str, -) -> dict[str, Any]: - keys = [str(row.get("uri") or ""), str(row.get("memory_id") or "")] - annotation = _lookup_annotation(index, keys, subject_type="memory") - if annotation: - payload = _category_payload(annotation) - payload["subject_type"] = "memory" - payload["category_source"] = payload.get("category_source") or "annotation_sidecar" - return payload - return _annotate_text( - entries, - text, - trigger_field="memory_triggers", - subject_type="memory", - ) - - def _values(payload: dict[str, Any], key: str) -> set[str]: value = payload.get(key) if isinstance(value, str) and value.strip(): diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 4664919a3e..130c421ed9 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -4,8 +4,6 @@ import argparse import importlib.util import json -import os -import re import subprocess import sys from concurrent.futures import ThreadPoolExecutor, as_completed @@ -33,24 +31,6 @@ REPO_ROOT = Path(__file__).resolve().parents[3] -def _split_path_text(value: Any) -> list[str]: - if isinstance(value, list): - return [str(item).strip() for item in value if str(item).strip()] - if isinstance(value, dict): - return [item for raw in value.values() for item in _split_path_text(raw)] - text = str(value or "").strip() - if not text: - return [] - return [part.strip() for part in re.split(r"[:\n,]", text) if part.strip()] - - -def _resolve_repo_path(raw_path: str) -> Path: - path = Path(raw_path).expanduser() - if not path.is_absolute(): - path = REPO_ROOT / path - return path - - def _reward(sim: dict[str, Any]) -> float: info = sim.get("reward_info") or {} value = info.get("reward", sim.get("reward", 0.0)) @@ -567,37 +547,15 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: if not isinstance(category_rerank, dict) or not category_rerank.get("enabled"): continue raw_catalog_path = category_rerank.get("catalog_path") - catalog_path = _resolve_repo_path(str(raw_catalog_path or "")) + catalog_path = Path(str(raw_catalog_path or "")).expanduser() + if raw_catalog_path and not catalog_path.is_absolute(): + catalog_path = REPO_ROOT / catalog_path exists = bool(raw_catalog_path and catalog_path.is_file()) - annotation_env_names = _split_path_text(category_rerank.get("annotation_files_env")) or [ - "OPENVIKING_TAU2_CATEGORY_ANNOTATION_FILES", - "AGENT_HARNESS_TAU2_CATEGORY_ANNOTATION_FILES", - ] - raw_annotation_files = _split_path_text(category_rerank.get("annotation_files")) - for env_name in annotation_env_names: - raw_annotation_files.extend(_split_path_text(os.environ.get(env_name))) - annotation_files = [] - for raw_annotation_file in dict.fromkeys(raw_annotation_files): - annotation_path = _resolve_repo_path(raw_annotation_file) - annotation_exists = annotation_path.is_file() - annotation_files.append( - { - "path": str(annotation_path), - "exists": annotation_exists, - } - ) - if strict and not annotation_exists: - errors.append( - f"missing category annotation sidecar for {strategy.get('id')}: " - f"{raw_annotation_file}" - ) category_rows.append( { "strategy_id": strategy.get("id"), "catalog_path": str(catalog_path) if raw_catalog_path else None, "exists": exists, - "annotation_files_env": annotation_env_names, - "annotation_files": annotation_files, "apply_nodes": category_rerank.get("apply_nodes"), "retrieve_limit": category_rerank.get("retrieve_limit"), "inject_limit": category_rerank.get("inject_limit"), diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 13a5b3d412..14d8693ead 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -1,4 +1,3 @@ -import json from pathlib import Path from benchmark.tau2.scripts.category_rerank import CategoryReranker @@ -89,89 +88,6 @@ def test_category_rerank_skips_non_target_node() -> None: assert trace_rows[1]["selected_for_injection"] is False -def test_category_rerank_prefers_annotation_sidecar(tmp_path: Path) -> None: - sidecar = tmp_path / "annotations.jsonl" - memory_uri = "viking://agent/demo/memories/trajectories/sidecar_exchange.md" - query_subject = ( - "tau2_query_signature_tau2_retail_pre_write_action_tools_exchange_delivered_order_items" - ) - rows = [ - { - "schema_version": "memory_category_annotation.v0", - "annotation_id": f"query:{query_subject}:abc123", - "request_id": f"query:{query_subject}:abc123", - "subject": { - "subject_type": "query", - "subject_id": query_subject, - "domain": "retail", - }, - "category": { - "category1": "sidecar_query_family", - "category2": "sidecar_exact", - "category_source": "existing_catalog", - "confidence": 1.0, - }, - }, - { - "schema_version": "memory_category_annotation.v0", - "annotation_id": "memory:sidecar_exchange:abc123", - "request_id": "memory:sidecar_exchange:abc123", - "subject": { - "subject_type": "memory", - "subject_id": "sidecar_exchange", - "subject_ref": memory_uri, - "domain": "retail", - }, - "category": { - "category1": "sidecar_query_family", - "category2": "sidecar_exact", - "category_source": "llm_prompt", - "confidence": 1.0, - }, - }, - ] - sidecar.write_text("\n".join(json.dumps(row) for row in rows) + "\n") - reranker = CategoryReranker.from_payload( - { - "enabled": True, - "catalog_path": "benchmark/tau2/config/category_catalog.json", - "annotation_files": [str(sidecar)], - "apply_nodes": ["before_write_tool_call"], - "retrieve_limit": 6, - "inject_limit": 1, - "mismatch_policy": "keep_positive_match_drop_mismatch", - "no_match_policy": "skip_injection", - }, - repo_root=Path(__file__).resolve().parents[2], - ) - - selected, trace_rows, diagnostics = reranker.select( - domain="retail", - query="Before executing write-like tool call(s): exchange_delivered_order_items({})", - rows=[ - { - "uri": memory_uri, - "score": 0.1, - "_text": "This text would otherwise look like cancel_pending_order.", - }, - { - "uri": "viking://agent/demo/memories/trajectories/catalog_exchange.md", - "score": 0.9, - "_text": "Use exchange_delivered_order_items for a delivered order exchange.", - }, - ], - decision_node="before_write_tool_call", - base_limit=2, - ) - - assert diagnostics["annotation_sidecar"]["row_count"] == 2 - assert diagnostics["query_category"]["annotation_id"] == f"query:{query_subject}:abc123" - assert [row["uri"] for row in selected] == [memory_uri] - assert trace_rows[0]["memory_category1_prompt"] == "sidecar_query_family" - assert trace_rows[0]["query_category2_prompt"] == "sidecar_exact" - assert trace_rows[1]["selected_for_injection"] is False - - def test_scope_prompt_loads_domain_file(tmp_path: Path) -> None: prompt = tmp_path / "retail_scope.md" prompt.write_text("same order") From 91f9edfcdd4b2157b89b4ef017d344e650985f7a Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 00:51:10 +0800 Subject: [PATCH 14/42] docs(tau2): clarify self-generated category signals --- benchmark/tau2/README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 1aa7fafe80..e5a6874c11 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -156,16 +156,18 @@ prepared in parallel with `benchmark.corpus_prepare_concurrency`; session commits inside one corpus remain serial to preserve OpenViking write semantics. `config/category_rerank.yaml` keeps the PR-B trajectory memory route and enables -an adapter-local FGMemory-style probe: pre-write recall, annotation category -rerank, and the retail scope prompt used by the Harness High-TrajView/FGMemory -route. The category sub-policy follows the S84 component settings, but the -alignment target is the red-box S89/FGMemory high result: retrieve 6, keep -same-category candidates, inject at most 2, skip injection when no positive -category match exists, and apply the scope/applicability prompt at the system -prompt injection point. Retrieval traces include the query category, candidate -memory categories, rerank reasons, selected rows, skipped rows, scope prompt -metadata, and the flat `*_category*_prompt` fields consumed by Harness -diagnostics. +an adapter-local FGMemory-style probe: pre-write recall, self-generated runtime +category signals, and the retail scope prompt used by the Harness +High-TrajView/FGMemory route. The category sub-policy follows the S84 component +settings, but the alignment target is the red-box S89/FGMemory high result: +retrieve 6, keep same-category candidates, inject at most 2, skip injection +when no positive category match exists, and apply the scope/applicability prompt +at the system prompt injection point. Runtime categories are generated from the +local TAU-2 category catalog, current pre-write query text, candidate write tool +names, retrieved trajectory text, and memory URIs; no Harness sidecar artifact +is required. Retrieval traces include the query category, candidate memory +categories, rerank reasons, selected rows, skipped rows, scope prompt metadata, +and flat `*_category*_prompt` fields kept compatible with Harness diagnostics. ## User Simulator Policy From 2b767f203e6676629120aa3eabdfec7bc962f2b9 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 23:50:30 +0800 Subject: [PATCH 15/42] fix(benchmark): guard tau2 infrastructure failures --- benchmark/tau2/scripts/run_eval.py | 2 + benchmark/tau2/scripts/run_memory_v2_eval.py | 30 +++++++- benchmark/tau2/scripts/tau2_common.py | 73 +++++++++++++++++--- 3 files changed, 95 insertions(+), 10 deletions(-) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index f2c250a247..2fec4cbf0e 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -11,6 +11,7 @@ from typing import Any from tau2_common import ( + assert_tau2_results_complete, domains, load_config, normalize_litellm_env, @@ -49,6 +50,7 @@ def _db_match(sim: dict[str, Any]) -> bool | None: def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: data = json.loads(results_path.read_text(encoding="utf-8")) + assert_tau2_results_complete(data, context=str(results_path)) sims = data.get("simulations") or [] rewards = [_reward(sim) for sim in sims] db_values = [_db_match(sim) for sim in sims] diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 67e68d2b44..bc1fd026af 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -2,14 +2,16 @@ from __future__ import annotations import argparse +import importlib import json import shutil import sys import time +from copy import deepcopy from pathlib import Path from typing import Any -from tau2_common import normalize_litellm_env +from tau2_common import assert_tau2_results_complete, normalize_litellm_env AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] @@ -45,6 +47,27 @@ def _add_tau2_to_path(tau2_repo: Path) -> None: sys.path.insert(0, str(src if src.is_dir() else tau2_repo)) +def _patch_tau2_auxiliary_llm_defaults(llm: str, llm_args: dict[str, Any]) -> None: + # TAU-2 exposes agent/user LLMs in TextRunConfig, but NL assertion scoring + # still reads module defaults. Keep the evaluator on the same configured + # model so benchmark runs do not fall back to inaccessible upstream defaults. + patches = { + "DEFAULT_LLM_NL_ASSERTIONS": llm, + "DEFAULT_LLM_NL_ASSERTIONS_ARGS": deepcopy(llm_args), + "DEFAULT_LLM_ENV_INTERFACE": llm, + "DEFAULT_LLM_ENV_INTERFACE_ARGS": deepcopy(llm_args), + } + for module_name in ( + "tau2.config", + "tau2.evaluator.evaluator_nl_assertions", + "tau2.environment.utils.interface_agent", + ): + module = importlib.import_module(module_name) + for name, value in patches.items(): + if hasattr(module, name): + setattr(module, name, deepcopy(value)) + + def _save_to_arg(path: Path) -> str: # Some TAU-2 versions append ".json"; newer versions treat save_to as a # run directory and write results.json under it. @@ -174,6 +197,7 @@ def _run_tau2( save_to: Path, ): _add_tau2_to_path(tau2_repo) + _patch_tau2_auxiliary_llm_defaults(agent_llm, agent_llm_args) from tau2.data_model.simulation import RunConfig, TextRunConfig from tau2.run import run_domain @@ -300,6 +324,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) ) data = json.loads(train_results.read_text()) + assert_tau2_results_complete(data, context=f"{args.domain} train") client = _client(args) committed = [] try: @@ -652,6 +677,9 @@ def main() -> int: seed=args.seed, save_to=eval_results, ) + assert_tau2_results_complete( + json.loads(eval_results.read_text()), context=f"{args.domain} eval" + ) summary = { "run_label": args.run_label, "domain": args.domain, diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index a8b5ce2013..4f4505bd87 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -5,13 +5,13 @@ import re import shutil import subprocess +from collections import Counter from datetime import datetime, timezone from pathlib import Path from typing import Any import yaml - TAU2_DIR = Path(__file__).resolve().parents[1] REPO_ROOT = TAU2_DIR.parents[1] CONFIRMATION_AWARE_UPSTREAM_PR = "https://github.com/sierra-research/tau2-bench/pull/297" @@ -63,6 +63,7 @@ def normalize_litellm_env() -> dict[str, Any]: def render_env(value: Any) -> Any: if isinstance(value, str): + def replace(match: re.Match[str]) -> str: name = match.group(1) default = match.group(2) or "" @@ -79,11 +80,7 @@ def replace(match: re.Match[str]) -> str: def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: merged = dict(base) for key, value in override.items(): - if ( - key in merged - and isinstance(merged[key], dict) - and isinstance(value, dict) - ): + if key in merged and isinstance(merged[key], dict) and isinstance(value, dict): merged[key] = deep_merge(merged[key], value) else: merged[key] = value @@ -125,6 +122,66 @@ def write_json(path: Path, payload: Any) -> None: ) +def tau2_result_failures(data: dict[str, Any], *, expected_trials: int = 1) -> list[str]: + tasks = data.get("tasks") or [] + simulations = data.get("simulations") or [] + failures: list[str] = [] + if tasks: + expected_task_ids = { + str(task.get("id", task.get("task_id"))) for task in tasks if isinstance(task, dict) + } + observed_task_ids = {str(sim.get("task_id")) for sim in simulations} + expected = len(tasks) * expected_trials + if len(simulations) != expected: + failures.append(f"expected {expected} simulations, found {len(simulations)}") + if observed_task_ids != expected_task_ids: + missing = sorted(expected_task_ids - observed_task_ids) + extra = sorted(observed_task_ids - expected_task_ids) + failures.append( + f"simulation task ids do not match tasks: missing={missing[:10]} extra={extra[:10]}" + ) + expected_pairs = { + (task_id, trial) for task_id in expected_task_ids for trial in range(expected_trials) + } + observed_pairs = [ + (str(sim.get("task_id")), int(sim.get("trial", 0))) for sim in simulations + ] + duplicate_pairs = sorted( + pair for pair, count in Counter(observed_pairs).items() if count != 1 + ) + missing_pairs = sorted(expected_pairs - set(observed_pairs)) + if duplicate_pairs or missing_pairs: + failures.append( + "simulation task/trial coverage mismatch: " + f"missing={missing_pairs[:10]} duplicate={duplicate_pairs[:10]}" + ) + + for sim in simulations: + info = sim.get("info") or {} + termination_reason = str(sim.get("termination_reason") or "") + if info.get("failed_after_attempts") or "infrastructure_error" in termination_reason: + failures.append( + "task=" + f"{sim.get('task_id')} trial={sim.get('trial', 0)} " + f"termination={termination_reason} error={info.get('error') or info.get('error_type')}" + ) + elif not sim.get("messages"): + failures.append( + f"task={sim.get('task_id')} trial={sim.get('trial', 0)} has no messages" + ) + return failures + + +def assert_tau2_results_complete( + data: dict[str, Any], *, context: str, expected_trials: int = 1 +) -> None: + failures = tau2_result_failures(data, expected_trials=expected_trials) + if failures: + preview = "; ".join(failures[:5]) + more = f"; ... {len(failures) - 5} more" if len(failures) > 5 else "" + raise RuntimeError(f"{context} produced invalid TAU-2 results: {preview}{more}") + + def strategy_ids(config: dict[str, Any]) -> list[str]: strategies = config.get("strategies") or [] if not isinstance(strategies, list): @@ -219,9 +276,7 @@ def user_simulator_policy(config: dict[str, Any]) -> str: policy = config.get("eval", {}).get("user_simulator_policy", "official") policy = str(policy) if policy not in {"official", "confirmation_aware"}: - raise ValueError( - "eval.user_simulator_policy must be 'official' or 'confirmation_aware'" - ) + raise ValueError("eval.user_simulator_policy must be 'official' or 'confirmation_aware'") return policy From a624451bd6b41ba654b0a69618a352c66df7594b Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 01:53:39 +0800 Subject: [PATCH 16/42] bench(tau2): summarize category trace coverage --- benchmark/tau2/scripts/run_memory_v2_eval.py | 110 +++++++++++++++++++ tests/benchmark/test_tau2_category_rerank.py | 54 ++++++++- 2 files changed, 163 insertions(+), 1 deletion(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index ea8cc14f75..f41c6bed83 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -7,6 +7,7 @@ import shutil import sys import time +from collections import Counter from copy import deepcopy from pathlib import Path from typing import Any @@ -194,6 +195,114 @@ def _metrics(results_path: Path) -> dict[str, Any]: } +def _trace_category_summary(trace_path: Path) -> dict[str, Any]: + counters: Counter[str] = Counter() + decision_nodes: Counter[str] = Counter() + category_decisions: Counter[str] = Counter() + query_category_sources: Counter[str] = Counter() + memory_category_sources: Counter[str] = Counter() + selected_memory_category_sources: Counter[str] = Counter() + tool_calls: Counter[str] = Counter() + trace_rows = 0 + category_event_count = 0 + + if not trace_path.is_file(): + return { + "trace_present": False, + "trace_rows": 0, + "category_event_count": 0, + } + + for line_number, line in enumerate(trace_path.read_text(encoding="utf-8").splitlines(), 1): + if not line.strip(): + continue + trace_rows += 1 + try: + row = json.loads(line) + except json.JSONDecodeError: + counters["json_decode_error_count"] += 1 + counters[f"json_decode_error_line:{line_number}"] += 1 + continue + if not isinstance(row, dict): + counters["non_object_row_count"] += 1 + continue + + decision_nodes[str(row.get("decision_node") or "unknown")] += 1 + for call in row.get("tool_calls") or []: + if isinstance(call, dict) and call.get("name"): + tool_calls[str(call["name"])] += 1 + + category = row.get("category_rerank") if isinstance(row.get("category_rerank"), dict) else {} + if category: + category_event_count += 1 + if category.get("enabled"): + counters["category_enabled_event_count"] += 1 + if category.get("applied"): + counters["category_applied_event_count"] += 1 + if category.get("decision"): + category_decisions[str(category["decision"])] += 1 + query_category = ( + category.get("query_category") + if isinstance(category.get("query_category"), dict) + else {} + ) + if query_category.get("category_source"): + query_category_sources[str(query_category["category_source"])] += 1 + if query_category.get("matched"): + counters["query_category_matched_event_count"] += 1 + + for match in row.get("matches") or []: + if not isinstance(match, dict): + continue + counters["raw_match_count"] += 1 + selected = bool(match.get("selected_for_injection") or match.get("injected")) + if selected: + counters["selected_match_count"] += 1 + memory_source = match.get("memory_category_source_prompt") + if memory_source: + counters["memory_category_present_count"] += 1 + memory_category_sources[str(memory_source)] += 1 + if selected: + counters["selected_memory_category_present_count"] += 1 + selected_memory_category_sources[str(memory_source)] += 1 + elif match.get("category_rerank_reasons") is not None: + counters["memory_category_missing_count"] += 1 + if match.get("category1_match") or match.get("category2_match"): + counters["positive_category_match_count"] += 1 + if selected: + counters["selected_positive_category_match_count"] += 1 + + raw_count = counters["raw_match_count"] + selected_count = counters["selected_match_count"] + return { + "trace_present": True, + "trace_rows": trace_rows, + "category_event_count": category_event_count, + "counts": dict(counters), + "decision_nodes": dict(decision_nodes), + "category_decisions": dict(category_decisions), + "query_category_sources": dict(query_category_sources), + "memory_category_sources": dict(memory_category_sources), + "selected_memory_category_sources": dict(selected_memory_category_sources), + "tool_calls": dict(tool_calls), + "rates": { + "memory_category_candidate_coverage": ( + counters["memory_category_present_count"] / raw_count if raw_count else None + ), + "selected_memory_category_coverage": ( + counters["selected_memory_category_present_count"] / selected_count + if selected_count + else None + ), + "selected_positive_category_match_rate": ( + counters["selected_positive_category_match_count"] / selected_count + if selected_count + else None + ), + }, + } + + def _tool_call_name(tool_call: Any) -> str: if isinstance(tool_call, dict): return str(tool_call.get("name") or tool_call.get("function", {}).get("name") or "") @@ -829,6 +938,7 @@ def main() -> int: "scope_prompt": args.scope_prompt_summary, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), + "retrieval_trace_summary": _trace_category_summary(trace_path), "metrics": _metrics(eval_results), } _write_json(summary_path, summary) diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 14d8693ead..494805f6b3 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -1,7 +1,8 @@ +import json from pathlib import Path from benchmark.tau2.scripts.category_rerank import CategoryReranker -from benchmark.tau2.scripts.run_memory_v2_eval import _load_scope_prompt +from benchmark.tau2.scripts.run_memory_v2_eval import _load_scope_prompt, _trace_category_summary def _reranker() -> CategoryReranker: @@ -117,3 +118,54 @@ def test_scope_prompt_skips_unconfigured_domain(tmp_path: Path) -> None: assert text == "" assert summary["loaded"] is False assert summary["skipped_reason"] == "no_domain_scope_prompt" + + +def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: + trace = tmp_path / "retrieval_trace.jsonl" + rows = [ + { + "decision_node": "static_scope_prompt", + "retrieval_action_taken": "scope_prompt_static_injection", + "injected": True, + }, + { + "decision_node": "before_write_tool_call", + "retrieval_action_taken": "retrieve_and_inject", + "tool_calls": [{"name": "exchange_delivered_order_items"}], + "category_rerank": { + "enabled": True, + "applied": True, + "decision": "soft_reranked_keep_category2_matches", + "query_category": { + "matched": True, + "category_source": "tau2_category_catalog_keyword_match", + }, + }, + "matches": [ + { + "uri": "a", + "selected_for_injection": True, + "memory_category_source_prompt": "tau2_category_catalog_keyword_match", + "category2_match": True, + }, + { + "uri": "b", + "selected_for_injection": False, + "category_rerank_reasons": ["missing_memory_category"], + }, + ], + }, + ] + trace.write_text( + "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" + ) + + summary = _trace_category_summary(trace) + + assert summary["trace_present"] is True + assert summary["decision_nodes"]["before_write_tool_call"] == 1 + assert summary["category_decisions"]["soft_reranked_keep_category2_matches"] == 1 + assert summary["query_category_sources"]["tau2_category_catalog_keyword_match"] == 1 + assert summary["selected_memory_category_sources"]["tau2_category_catalog_keyword_match"] == 1 + assert summary["tool_calls"]["exchange_delivered_order_items"] == 1 + assert summary["rates"]["selected_memory_category_coverage"] == 1.0 From 63a100481be6da73e889d1c8373da0100052e7aa Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 01:54:45 +0800 Subject: [PATCH 17/42] fix(benchmark): resolve tau2 runner paths --- benchmark/tau2/scripts/run_memory_v2_eval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index bc1fd026af..c26c347ffa 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -631,6 +631,9 @@ def main() -> int: normalize_litellm_env() args.tau2_repo = args.tau2_repo.resolve() + args.run_dir = args.run_dir.resolve() + if args.corpus_dir is not None: + args.corpus_dir = args.corpus_dir.resolve() args.run_dir.mkdir(parents=True, exist_ok=True) corpus_dir = args.corpus_dir or args.run_dir corpus_dir.mkdir(parents=True, exist_ok=True) From 05932ddb8e02d983168d4697b80e8bd0a14dfb12 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 02:51:24 +0800 Subject: [PATCH 18/42] docs(tau2): document category trace summary --- benchmark/tau2/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index e5a6874c11..077c306487 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -168,6 +168,11 @@ names, retrieved trajectory text, and memory URIs; no Harness sidecar artifact is required. Retrieval traces include the query category, candidate memory categories, rerank reasons, selected rows, skipped rows, scope prompt metadata, and flat `*_category*_prompt` fields kept compatible with Harness diagnostics. +Each run summary also includes `retrieval_trace_summary`, a compact rollup of +decision nodes, category decisions, query/memory category sources, selected +category coverage, and write tool calls. Use it as the first check that a run is +using this branch's self-generated category signal before opening the JSONL +trace. ## User Simulator Policy From fb62c46148ad87e8b130235e9aba7994e358e3e4 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 02:52:51 +0800 Subject: [PATCH 19/42] fix(memory): add trajectory evidence examples --- openviking/prompts/templates/memory/trajectories.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openviking/prompts/templates/memory/trajectories.yaml b/openviking/prompts/templates/memory/trajectories.yaml index a16b7485f9..b467b17b39 100644 --- a/openviking/prompts/templates/memory/trajectories.yaml +++ b/openviking/prompts/templates/memory/trajectories.yaml @@ -59,6 +59,8 @@ fields: - Keep the memory grounded in this session, but abstract away user-specific names, raw IDs, exact payloads, and long tool responses. - Do not include raw user/order/reservation/payment/card IDs, user names, email/phone/address values, exact dates, case-specific amounts, exact budgets, route pairs, airport pairs, passenger/bag counts, card suffixes, flight numbers, order numbers, or raw tool payloads. Replace them with semantic descriptions such as "a delivered order", "an ineligible basic-economy reservation", "the saved payment method", or "a policy-ineligible cancellation". Stable policy constants may be kept when they are needed for future execution. - The Evidence line is not an exception: it may name the source status and lesson type, but must not carry case-specific values such as exact amounts, routes, dates, counts, product names, or customer-specific state. + - Bad Evidence: "a mechanical keyboard exceeded the $200 threshold". Good Evidence: "a product variant exceeded the user-specified threshold". + - Bad Evidence: "the ORD-to-LAX reservation on May 12 used card ending 1234". Good Evidence: "a matching reservation required the saved payment method and policy checks before the write action". - Mention tool names when they are part of the reusable path, but summarize observations instead of copying raw JSON. - If the session failed or was partial, still write the best reusable lesson: put the corrected approach in Procedure and the failure cause in Anti-patterns / Evidence. - Avoid broad SOPs. The Trigger and Applicability Boundary should make this record narrower than a whole domain workflow. From b613e3ec677e27fe18a8541d5420bb67f56f3816 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 02:55:52 +0800 Subject: [PATCH 20/42] bench(tau2): align trajectory baseline guard --- benchmark/tau2/scripts/tau2_common.py | 26 +++++++++++++++++++ .../templates/memory/trajectories.yaml | 2 ++ 2 files changed, 28 insertions(+) diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index 7a1dfd1d93..4f4505bd87 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -5,6 +5,7 @@ import re import shutil import subprocess +from collections import Counter from datetime import datetime, timezone from pathlib import Path from typing import Any @@ -126,9 +127,34 @@ def tau2_result_failures(data: dict[str, Any], *, expected_trials: int = 1) -> l simulations = data.get("simulations") or [] failures: list[str] = [] if tasks: + expected_task_ids = { + str(task.get("id", task.get("task_id"))) for task in tasks if isinstance(task, dict) + } + observed_task_ids = {str(sim.get("task_id")) for sim in simulations} expected = len(tasks) * expected_trials if len(simulations) != expected: failures.append(f"expected {expected} simulations, found {len(simulations)}") + if observed_task_ids != expected_task_ids: + missing = sorted(expected_task_ids - observed_task_ids) + extra = sorted(observed_task_ids - expected_task_ids) + failures.append( + f"simulation task ids do not match tasks: missing={missing[:10]} extra={extra[:10]}" + ) + expected_pairs = { + (task_id, trial) for task_id in expected_task_ids for trial in range(expected_trials) + } + observed_pairs = [ + (str(sim.get("task_id")), int(sim.get("trial", 0))) for sim in simulations + ] + duplicate_pairs = sorted( + pair for pair, count in Counter(observed_pairs).items() if count != 1 + ) + missing_pairs = sorted(expected_pairs - set(observed_pairs)) + if duplicate_pairs or missing_pairs: + failures.append( + "simulation task/trial coverage mismatch: " + f"missing={missing_pairs[:10]} duplicate={duplicate_pairs[:10]}" + ) for sim in simulations: info = sim.get("info") or {} diff --git a/openviking/prompts/templates/memory/trajectories.yaml b/openviking/prompts/templates/memory/trajectories.yaml index a16b7485f9..b467b17b39 100644 --- a/openviking/prompts/templates/memory/trajectories.yaml +++ b/openviking/prompts/templates/memory/trajectories.yaml @@ -59,6 +59,8 @@ fields: - Keep the memory grounded in this session, but abstract away user-specific names, raw IDs, exact payloads, and long tool responses. - Do not include raw user/order/reservation/payment/card IDs, user names, email/phone/address values, exact dates, case-specific amounts, exact budgets, route pairs, airport pairs, passenger/bag counts, card suffixes, flight numbers, order numbers, or raw tool payloads. Replace them with semantic descriptions such as "a delivered order", "an ineligible basic-economy reservation", "the saved payment method", or "a policy-ineligible cancellation". Stable policy constants may be kept when they are needed for future execution. - The Evidence line is not an exception: it may name the source status and lesson type, but must not carry case-specific values such as exact amounts, routes, dates, counts, product names, or customer-specific state. + - Bad Evidence: "a mechanical keyboard exceeded the $200 threshold". Good Evidence: "a product variant exceeded the user-specified threshold". + - Bad Evidence: "the ORD-to-LAX reservation on May 12 used card ending 1234". Good Evidence: "a matching reservation required the saved payment method and policy checks before the write action". - Mention tool names when they are part of the reusable path, but summarize observations instead of copying raw JSON. - If the session failed or was partial, still write the best reusable lesson: put the corrected approach in Procedure and the failure cause in Anti-patterns / Evidence. - Avoid broad SOPs. The Trigger and Applicability Boundary should make this record narrower than a whole domain workflow. From cc1f0099ddbf3931b2e050be47061da7ee580063 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 04:00:06 +0800 Subject: [PATCH 21/42] bench(tau2): report concrete memory trace coverage --- benchmark/tau2/scripts/run_memory_v2_eval.py | 27 ++++++++++++++++++++ tests/benchmark/test_tau2_category_rerank.py | 8 ++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index f41c6bed83..ce076fe76e 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -196,6 +196,10 @@ def _metrics(results_path: Path) -> dict[str, Any]: def _trace_category_summary(trace_path: Path) -> dict[str, Any]: + def is_aggregate_memory_uri(uri: Any) -> bool: + value = str(uri or "").split("#", 1)[0] + return value.endswith("/.overview.md") or value.endswith("/.abstract.md") + counters: Counter[str] = Counter() decision_nodes: Counter[str] = Counter() category_decisions: Counter[str] = Counter() @@ -258,6 +262,14 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: selected = bool(match.get("selected_for_injection") or match.get("injected")) if selected: counters["selected_match_count"] += 1 + if is_aggregate_memory_uri(match.get("uri")): + counters["aggregate_memory_candidate_count"] += 1 + if selected: + counters["selected_aggregate_memory_count"] += 1 + else: + counters["concrete_memory_candidate_count"] += 1 + if selected: + counters["selected_concrete_memory_count"] += 1 memory_source = match.get("memory_category_source_prompt") if memory_source: counters["memory_category_present_count"] += 1 @@ -274,6 +286,13 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: raw_count = counters["raw_match_count"] selected_count = counters["selected_match_count"] + for key in [ + "aggregate_memory_candidate_count", + "concrete_memory_candidate_count", + "selected_aggregate_memory_count", + "selected_concrete_memory_count", + ]: + counters[key] += 0 return { "trace_present": True, "trace_rows": trace_rows, @@ -299,6 +318,14 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: if selected_count else None ), + "concrete_memory_candidate_rate": ( + counters["concrete_memory_candidate_count"] / raw_count if raw_count else None + ), + "selected_concrete_memory_rate": ( + counters["selected_concrete_memory_count"] / selected_count + if selected_count + else None + ), }, } diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 494805f6b3..b0f616a3f9 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -143,13 +143,13 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: }, "matches": [ { - "uri": "a", + "uri": "viking://agent/example/memories/trajectories/.overview.md", "selected_for_injection": True, "memory_category_source_prompt": "tau2_category_catalog_keyword_match", "category2_match": True, }, { - "uri": "b", + "uri": "viking://agent/example/memories/trajectories/concrete.md", "selected_for_injection": False, "category_rerank_reasons": ["missing_memory_category"], }, @@ -169,3 +169,7 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: assert summary["selected_memory_category_sources"]["tau2_category_catalog_keyword_match"] == 1 assert summary["tool_calls"]["exchange_delivered_order_items"] == 1 assert summary["rates"]["selected_memory_category_coverage"] == 1.0 + assert summary["counts"]["aggregate_memory_candidate_count"] == 1 + assert summary["counts"]["concrete_memory_candidate_count"] == 1 + assert summary["rates"]["concrete_memory_candidate_rate"] == 0.5 + assert summary["rates"]["selected_concrete_memory_rate"] == 0.0 From c0aa47a5d8fcce93b23314c34796872a6e986a20 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 04:04:47 +0800 Subject: [PATCH 22/42] bench(tau2): gate diagnostic category evidence --- benchmark/tau2/README.md | 11 ++- benchmark/tau2/scripts/run_eval.py | 91 +++++++++++++++----- benchmark/tau2/scripts/run_memory_v2_eval.py | 47 +++++++++- tests/benchmark/test_tau2_category_rerank.py | 70 ++++++++++++++- 4 files changed, 193 insertions(+), 26 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 077c306487..c8ff5d5c67 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -170,9 +170,14 @@ categories, rerank reasons, selected rows, skipped rows, scope prompt metadata, and flat `*_category*_prompt` fields kept compatible with Harness diagnostics. Each run summary also includes `retrieval_trace_summary`, a compact rollup of decision nodes, category decisions, query/memory category sources, selected -category coverage, and write tool calls. Use it as the first check that a run is -using this branch's self-generated category signal before opening the JSONL -trace. +category coverage, aggregate-vs-concrete memory candidate coverage, and write +tool calls. Use it as the first check that a run is using this branch's +self-generated category signal before opening the JSONL trace. Category runs +whose runtime trace has only aggregate `.overview.md` / `.abstract.md` +candidates, no memory category coverage, or no selected positive category match +are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes +those diagnostic cells from the main reward/DB aggregates while preserving their +metrics and artifacts for debugging. ## User Simulator Policy diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 130c421ed9..763c8c2ba2 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -10,22 +10,40 @@ from pathlib import Path from typing import Any -from tau2_common import ( - assert_tau2_results_complete, - domains, - load_config, - normalize_litellm_env, - output_dir, - run_id, - simulator_policy_report, - split_file, - strategy_ids, - tau2_cli, - tau2_context, - tau2_repo, - user_simulator_policy, - write_json, -) +try: + from tau2_common import ( + assert_tau2_results_complete, + domains, + load_config, + normalize_litellm_env, + output_dir, + run_id, + simulator_policy_report, + split_file, + strategy_ids, + tau2_cli, + tau2_context, + tau2_repo, + user_simulator_policy, + write_json, + ) +except ModuleNotFoundError: # pragma: no cover - package import path + from .tau2_common import ( + assert_tau2_results_complete, + domains, + load_config, + normalize_litellm_env, + output_dir, + run_id, + simulator_policy_report, + split_file, + strategy_ids, + tau2_cli, + tau2_context, + tau2_repo, + user_simulator_policy, + write_json, + ) REPO_ROOT = Path(__file__).resolve().parents[3] @@ -345,6 +363,28 @@ def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, return _metrics_from_tau2_results(results_path) +def _cell_runtime_evidence(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any]: + if cell.get("memory_backend") == "openviking": + summary_path = Path(artifacts["summary"]) + if not summary_path.is_file(): + return {"status": "missing", "reasons": ["missing_summary"]} + summary = json.loads(summary_path.read_text(encoding="utf-8")) + evidence = summary.get("runtime_evidence") + if isinstance(evidence, dict): + return { + "status": str(evidence.get("status") or "valid"), + "reasons": list(evidence.get("reasons") or []), + } + return {"status": "valid", "reasons": []} + + +def _row_is_valid_evidence(row: dict[str, Any]) -> bool: + evidence = row.get("runtime_evidence") + if not isinstance(evidence, dict): + return True + return str(evidence.get("status") or "valid") == "valid" + + def _memory_corpus_key(cell: dict[str, Any]) -> str: corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) return f"{cell['domain']}_{corpus_id}" @@ -409,15 +449,22 @@ def _prepare_memory_corpora(plan: dict[str, Any], repo: Path, out: Path) -> list def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: metric_rows = [row for row in rows_for_group if row.get("metrics")] + valid_metric_rows = [row for row in metric_rows if _row_is_valid_evidence(row)] + diagnostic_rows = [ + row for row in metric_rows if not _row_is_valid_evidence(row) + ] sim_count = sum(int(row["metrics"].get("simulation_count") or 0) for row in metric_rows) + valid_sim_count = sum( + int(row["metrics"].get("simulation_count") or 0) for row in valid_metric_rows + ) reward_sum = sum( float(row["metrics"].get("avg_reward") or 0.0) * int(row["metrics"].get("simulation_count") or 0) - for row in metric_rows + for row in valid_metric_rows ) db_weighted_rows = [ row - for row in metric_rows + for row in valid_metric_rows if row["metrics"].get("db_match_rate") is not None and int(row["metrics"].get("simulation_count") or 0) > 0 ] @@ -432,8 +479,11 @@ def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: return { "cell_count": len(rows_for_group), "completed_cell_count": len(metric_rows), - "simulation_count": sim_count, - "avg_reward": reward_sum / sim_count if sim_count else None, + "valid_completed_cell_count": len(valid_metric_rows), + "diagnostic_cell_count": len(diagnostic_rows), + "diagnostic_simulation_count": sim_count - valid_sim_count, + "simulation_count": valid_sim_count, + "avg_reward": reward_sum / valid_sim_count if valid_sim_count else None, "db_match_rate": db_sum / db_weight if db_weight else None, } @@ -497,6 +547,7 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str } row["artifacts"] = _cell_artifacts(cell, repo, out) row["metrics"] = _cell_metrics(cell, row["artifacts"]) + row["runtime_evidence"] = _cell_runtime_evidence(cell, row["artifacts"]) rows.append(row) write_json(out / "cell_results" / f"{cell['run_label']}.json", row) if completed.returncode != 0: diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index ce076fe76e..6fc18a7cc2 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -330,6 +330,43 @@ def is_aggregate_memory_uri(uri: Any) -> bool: } +def _runtime_evidence_status( + *, + category_rerank: dict[str, Any], + retrieval_trace_summary: dict[str, Any], +) -> dict[str, Any]: + reasons: list[str] = [] + if category_rerank.get("enabled"): + if not retrieval_trace_summary.get("trace_present"): + reasons.append("missing_retrieval_trace") + counts = ( + retrieval_trace_summary.get("counts") + if isinstance(retrieval_trace_summary.get("counts"), dict) + else {} + ) + rates = ( + retrieval_trace_summary.get("rates") + if isinstance(retrieval_trace_summary.get("rates"), dict) + else {} + ) + if int(counts.get("category_applied_event_count") or 0) > 0: + if float(rates.get("concrete_memory_candidate_rate") or 0.0) <= 0.0: + reasons.append("no_concrete_memory_candidates") + if int(counts.get("memory_category_present_count") or 0) <= 0: + reasons.append("no_memory_category_coverage") + if ( + int(counts.get("query_category_matched_event_count") or 0) > 0 + and float(rates.get("selected_positive_category_match_rate") or 0.0) + <= 0.0 + ): + reasons.append("no_selected_positive_category_match") + + return { + "status": "diagnostic" if reasons else "valid", + "reasons": reasons, + } + + def _tool_call_name(tool_call: Any) -> str: if isinstance(tool_call, dict): return str(tool_call.get("name") or tool_call.get("function", {}).get("name") or "") @@ -954,6 +991,8 @@ def main() -> int: assert_tau2_results_complete( json.loads(eval_results.read_text()), context=f"{args.domain} eval" ) + category_summary = args.category_reranker.summary() + trace_summary = _trace_category_summary(trace_path) summary = { "run_label": args.run_label, "domain": args.domain, @@ -961,11 +1000,15 @@ def main() -> int: "retrieval_mode": args.retrieval_mode, "seed": args.seed, "corpus": corpus, - "category_rerank": args.category_reranker.summary(), + "category_rerank": category_summary, "scope_prompt": args.scope_prompt_summary, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), - "retrieval_trace_summary": _trace_category_summary(trace_path), + "retrieval_trace_summary": trace_summary, + "runtime_evidence": _runtime_evidence_status( + category_rerank=category_summary, + retrieval_trace_summary=trace_summary, + ), "metrics": _metrics(eval_results), } _write_json(summary_path, summary) diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index b0f616a3f9..64e86766dd 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -2,7 +2,12 @@ from pathlib import Path from benchmark.tau2.scripts.category_rerank import CategoryReranker -from benchmark.tau2.scripts.run_memory_v2_eval import _load_scope_prompt, _trace_category_summary +from benchmark.tau2.scripts.run_eval import _summarize +from benchmark.tau2.scripts.run_memory_v2_eval import ( + _load_scope_prompt, + _runtime_evidence_status, + _trace_category_summary, +) def _reranker() -> CategoryReranker: @@ -173,3 +178,66 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: assert summary["counts"]["concrete_memory_candidate_count"] == 1 assert summary["rates"]["concrete_memory_candidate_rate"] == 0.5 assert summary["rates"]["selected_concrete_memory_rate"] == 0.0 + + +def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + retrieval_trace_summary={ + "trace_present": True, + "counts": { + "category_applied_event_count": 1, + "query_category_matched_event_count": 1, + "memory_category_present_count": 1, + }, + "rates": { + "concrete_memory_candidate_rate": 0.0, + "selected_positive_category_match_rate": 0.0, + }, + }, + ) + + assert evidence["status"] == "diagnostic" + assert "no_concrete_memory_candidates" in evidence["reasons"] + assert "no_selected_positive_category_match" in evidence["reasons"] + + +def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: + scoreboard = _summarize( + [ + { + "domain": "airline", + "strategy_id": "memory_v2_trajectory_category_prewrite", + "metrics": { + "simulation_count": 1, + "avg_reward": 1.0, + "db_match_rate": 1.0, + }, + "runtime_evidence": { + "status": "diagnostic", + "reasons": ["no_concrete_memory_candidates"], + }, + }, + { + "domain": "airline", + "strategy_id": "memory_v2_trajectory_category_prewrite", + "metrics": { + "simulation_count": 1, + "avg_reward": 0.5, + "db_match_rate": 0.0, + }, + "runtime_evidence": {"status": "valid", "reasons": []}, + }, + ] + ) + + domain = scoreboard["strategies"]["memory_v2_trajectory_category_prewrite"][ + "domains" + ]["airline"] + assert domain["completed_cell_count"] == 2 + assert domain["valid_completed_cell_count"] == 1 + assert domain["diagnostic_cell_count"] == 1 + assert domain["diagnostic_simulation_count"] == 1 + assert domain["simulation_count"] == 1 + assert domain["avg_reward"] == 0.5 + assert domain["db_match_rate"] == 0.0 From 005627f9af9312763902b69e0d20936155cb231f Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 04:33:02 +0800 Subject: [PATCH 23/42] bench(tau2): tighten category coverage diagnostics --- benchmark/tau2/README.md | 4 ++-- benchmark/tau2/scripts/run_memory_v2_eval.py | 16 ++++++++++++++++ tests/benchmark/test_tau2_category_rerank.py | 6 ++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index c8ff5d5c67..6dfe1735f5 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -158,8 +158,8 @@ commits inside one corpus remain serial to preserve OpenViking write semantics. `config/category_rerank.yaml` keeps the PR-B trajectory memory route and enables an adapter-local FGMemory-style probe: pre-write recall, self-generated runtime category signals, and the retail scope prompt used by the Harness -High-TrajView/FGMemory route. The category sub-policy follows the S84 component -settings, but the alignment target is the red-box S89/FGMemory high result: +High-TrajView/FGMemory route. The alignment target is the red-box +S89/FGMemory high result: retrieve 6, keep same-category candidates, inject at most 2, skip injection when no positive category match exists, and apply the scope/applicability prompt at the system prompt injection point. Runtime categories are generated from the diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 6fc18a7cc2..1375c5e918 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -277,6 +277,10 @@ def is_aggregate_memory_uri(uri: Any) -> bool: if selected: counters["selected_memory_category_present_count"] += 1 selected_memory_category_sources[str(memory_source)] += 1 + if match.get("memory_category1_prompt") or match.get("memory_category2_prompt"): + counters["memory_category_matched_count"] += 1 + if selected: + counters["selected_memory_category_matched_count"] += 1 elif match.get("category_rerank_reasons") is not None: counters["memory_category_missing_count"] += 1 if match.get("category1_match") or match.get("category2_match"): @@ -291,6 +295,8 @@ def is_aggregate_memory_uri(uri: Any) -> bool: "concrete_memory_candidate_count", "selected_aggregate_memory_count", "selected_concrete_memory_count", + "memory_category_matched_count", + "selected_memory_category_matched_count", ]: counters[key] += 0 return { @@ -313,6 +319,14 @@ def is_aggregate_memory_uri(uri: Any) -> bool: if selected_count else None ), + "memory_category_match_coverage": ( + counters["memory_category_matched_count"] / raw_count if raw_count else None + ), + "selected_memory_category_match_coverage": ( + counters["selected_memory_category_matched_count"] / selected_count + if selected_count + else None + ), "selected_positive_category_match_rate": ( counters["selected_positive_category_match_count"] / selected_count if selected_count @@ -354,6 +368,8 @@ def _runtime_evidence_status( reasons.append("no_concrete_memory_candidates") if int(counts.get("memory_category_present_count") or 0) <= 0: reasons.append("no_memory_category_coverage") + if int(counts.get("memory_category_matched_count") or 0) <= 0: + reasons.append("no_matched_memory_categories") if ( int(counts.get("query_category_matched_event_count") or 0) > 0 and float(rates.get("selected_positive_category_match_rate") or 0.0) diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 64e86766dd..24290f1efc 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -151,6 +151,8 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: "uri": "viking://agent/example/memories/trajectories/.overview.md", "selected_for_injection": True, "memory_category_source_prompt": "tau2_category_catalog_keyword_match", + "memory_category1_prompt": ["retail_order_post_shipment_service_request"], + "memory_category2_prompt": ["delivered_order_exchange"], "category2_match": True, }, { @@ -174,6 +176,8 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: assert summary["selected_memory_category_sources"]["tau2_category_catalog_keyword_match"] == 1 assert summary["tool_calls"]["exchange_delivered_order_items"] == 1 assert summary["rates"]["selected_memory_category_coverage"] == 1.0 + assert summary["rates"]["memory_category_match_coverage"] == 0.5 + assert summary["rates"]["selected_memory_category_match_coverage"] == 1.0 assert summary["counts"]["aggregate_memory_candidate_count"] == 1 assert summary["counts"]["concrete_memory_candidate_count"] == 1 assert summary["rates"]["concrete_memory_candidate_rate"] == 0.5 @@ -189,6 +193,7 @@ def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: "category_applied_event_count": 1, "query_category_matched_event_count": 1, "memory_category_present_count": 1, + "memory_category_matched_count": 0, }, "rates": { "concrete_memory_candidate_rate": 0.0, @@ -199,6 +204,7 @@ def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: assert evidence["status"] == "diagnostic" assert "no_concrete_memory_candidates" in evidence["reasons"] + assert "no_matched_memory_categories" in evidence["reasons"] assert "no_selected_positive_category_match" in evidence["reasons"] From 96af30261cbfd81820b413955ac9aaa61b5b5fe1 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 05:11:51 +0800 Subject: [PATCH 24/42] bench(tau2): expose aggregate-only corpus probes --- benchmark/tau2/README.md | 4 +- benchmark/tau2/scripts/run_memory_v2_eval.py | 24 +++++++++--- tests/benchmark/test_tau2_category_rerank.py | 39 ++++++++++++++++++++ 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 6dfe1735f5..ad245ccaf1 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -177,7 +177,9 @@ whose runtime trace has only aggregate `.overview.md` / `.abstract.md` candidates, no memory category coverage, or no selected positive category match are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those diagnostic cells from the main reward/DB aggregates while preserving their -metrics and artifacts for debugging. +metrics and artifacts for debugging. Corpus manifests also include +`corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so +aggregate-only corpora can be spotted before reading the eval trace. ## User Simulator Policy diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 1375c5e918..654e0c3254 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -195,11 +195,12 @@ def _metrics(results_path: Path) -> dict[str, Any]: } -def _trace_category_summary(trace_path: Path) -> dict[str, Any]: - def is_aggregate_memory_uri(uri: Any) -> bool: - value = str(uri or "").split("#", 1)[0] - return value.endswith("/.overview.md") or value.endswith("/.abstract.md") +def _is_aggregate_memory_uri(uri: Any) -> bool: + value = str(uri or "").split("#", 1)[0] + return value.endswith("/.overview.md") or value.endswith("/.abstract.md") + +def _trace_category_summary(trace_path: Path) -> dict[str, Any]: counters: Counter[str] = Counter() decision_nodes: Counter[str] = Counter() category_decisions: Counter[str] = Counter() @@ -262,7 +263,7 @@ def is_aggregate_memory_uri(uri: Any) -> bool: selected = bool(match.get("selected_for_injection") or match.get("injected")) if selected: counters["selected_match_count"] += 1 - if is_aggregate_memory_uri(match.get("uri")): + if _is_aggregate_memory_uri(match.get("uri")): counters["aggregate_memory_candidate_count"] += 1 if selected: counters["selected_aggregate_memory_count"] += 1 @@ -552,19 +553,32 @@ def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]: for match in memories[: args.retrieval_top_k]: uri = getattr(match, "uri", "") text, read_error = _read_memory_text(client, match) + is_aggregate = _is_aggregate_memory_uri(uri) row = { "uri": uri, "score": getattr(match, "score", None), "text_chars": len(text), "non_empty": bool(str(text).strip()), + "is_aggregate_memory": is_aggregate, + "is_concrete_memory": not is_aggregate, } if read_error: row["read_error"] = read_error reads.append(row) + aggregate_match_count = sum(1 for row in reads if row["is_aggregate_memory"]) + concrete_match_count = sum(1 for row in reads if row["is_concrete_memory"]) return { "query": f"{args.domain} customer service order reservation booking cancellation exchange return update", "match_count": len(memories), + "aggregate_match_count": aggregate_match_count, + "concrete_match_count": concrete_match_count, "read_non_empty_count": sum(1 for row in reads if row["non_empty"]), + "aggregate_read_non_empty_count": sum( + 1 for row in reads if row["is_aggregate_memory"] and row["non_empty"] + ), + "concrete_read_non_empty_count": sum( + 1 for row in reads if row["is_concrete_memory"] and row["non_empty"] + ), "matches": reads, } diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 24290f1efc..17ab2921d7 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -1,10 +1,12 @@ import json from pathlib import Path +from types import SimpleNamespace from benchmark.tau2.scripts.category_rerank import CategoryReranker from benchmark.tau2.scripts.run_eval import _summarize from benchmark.tau2.scripts.run_memory_v2_eval import ( _load_scope_prompt, + _probe_corpus, _runtime_evidence_status, _trace_category_summary, ) @@ -247,3 +249,40 @@ def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: assert domain["simulation_count"] == 1 assert domain["avg_reward"] == 0.5 assert domain["db_match_rate"] == 0.0 + + +def test_probe_corpus_counts_aggregate_and_concrete_matches() -> None: + class FakeClient: + def search(self, **_: object) -> SimpleNamespace: + return SimpleNamespace( + memories=[ + SimpleNamespace( + uri="viking://agent/a/memories/trajectories/.overview.md", + score=0.2, + ), + SimpleNamespace( + uri="viking://agent/a/memories/trajectories/concrete.md#chunk_0001", + score=0.1, + ), + ] + ) + + def read(self, uri: str) -> str: + return f"body for {uri}" + + probe = _probe_corpus( + SimpleNamespace( + domain="airline", + search_uri="viking://agent/a/memories/trajectories", + retrieval_top_k=4, + ), + FakeClient(), + ) + + assert probe["match_count"] == 2 + assert probe["aggregate_match_count"] == 1 + assert probe["concrete_match_count"] == 1 + assert probe["aggregate_read_non_empty_count"] == 1 + assert probe["concrete_read_non_empty_count"] == 1 + assert probe["matches"][0]["is_aggregate_memory"] is True + assert probe["matches"][1]["is_concrete_memory"] is True From 1e96d6ceff3968792a09872859f2915abf86e552 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 05:41:02 +0800 Subject: [PATCH 25/42] bench(tau2): gate aggregate-only corpus probes --- benchmark/tau2/README.md | 4 +++- benchmark/tau2/scripts/run_memory_v2_eval.py | 12 ++++++++++++ tests/benchmark/test_tau2_category_rerank.py | 7 +++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index ad245ccaf1..36b2431019 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -179,7 +179,9 @@ are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those diagnostic cells from the main reward/DB aggregates while preserving their metrics and artifacts for debugging. Corpus manifests also include `corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so -aggregate-only corpora can be spotted before reading the eval trace. +aggregate-only corpora can be spotted before reading the eval trace; category +runs whose corpus probe has matches but no concrete matches are also marked +diagnostic. ## User Simulator Policy diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 654e0c3254..72da77a21f 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -349,9 +349,20 @@ def _runtime_evidence_status( *, category_rerank: dict[str, Any], retrieval_trace_summary: dict[str, Any], + corpus_probe: dict[str, Any] | None = None, ) -> dict[str, Any]: reasons: list[str] = [] if category_rerank.get("enabled"): + corpus_probe = corpus_probe if isinstance(corpus_probe, dict) else {} + if int(corpus_probe.get("match_count") or 0) > 0: + if int(corpus_probe.get("concrete_match_count") or 0) <= 0: + reasons.append("no_concrete_corpus_probe_matches") + if ( + int(corpus_probe.get("aggregate_match_count") or 0) + == int(corpus_probe.get("match_count") or 0) + ): + reasons.append("aggregate_only_corpus_probe") + if not retrieval_trace_summary.get("trace_present"): reasons.append("missing_retrieval_trace") counts = ( @@ -1038,6 +1049,7 @@ def main() -> int: "runtime_evidence": _runtime_evidence_status( category_rerank=category_summary, retrieval_trace_summary=trace_summary, + corpus_probe=corpus.get("corpus_probe") if isinstance(corpus, dict) else None, ), "metrics": _metrics(eval_results), } diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 17ab2921d7..803843957a 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -189,6 +189,11 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: evidence = _runtime_evidence_status( category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 1, + "aggregate_match_count": 1, + "concrete_match_count": 0, + }, retrieval_trace_summary={ "trace_present": True, "counts": { @@ -205,6 +210,8 @@ def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: ) assert evidence["status"] == "diagnostic" + assert "aggregate_only_corpus_probe" in evidence["reasons"] + assert "no_concrete_corpus_probe_matches" in evidence["reasons"] assert "no_concrete_memory_candidates" in evidence["reasons"] assert "no_matched_memory_categories" in evidence["reasons"] assert "no_selected_positive_category_match" in evidence["reasons"] From e30b79b973541e0fd6dc136d3a4e0f09ec361929 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 05:46:35 +0800 Subject: [PATCH 26/42] fix(benchmark): run no-memory tau2 eval in process --- benchmark/tau2/config/no_memory.yaml | 9 +++ benchmark/tau2/scripts/run_eval.py | 73 +++++++++++++------- benchmark/tau2/scripts/run_memory_v2_eval.py | 66 ++++++++++++++++-- 3 files changed, 117 insertions(+), 31 deletions(-) create mode 100644 benchmark/tau2/config/no_memory.yaml diff --git a/benchmark/tau2/config/no_memory.yaml b/benchmark/tau2/config/no_memory.yaml new file mode 100644 index 0000000000..93f35633b4 --- /dev/null +++ b/benchmark/tau2/config/no_memory.yaml @@ -0,0 +1,9 @@ +extends: baseline.yaml + +benchmark: + name: tau2_openviking_no_memory + +strategies: + - id: no_memory + label: TAU-2 no-memory baseline + memory_backend: none diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 2fec4cbf0e..715c0d2556 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -4,6 +4,7 @@ import argparse import importlib.util import json +import os import subprocess import sys from concurrent.futures import ThreadPoolExecutor, as_completed @@ -165,38 +166,44 @@ def _tau2_command( return None command = [ - tau2_cli(config), - "run", + sys.executable, + str(Path(__file__).with_name("run_memory_v2_eval.py")), + "--tau2-repo", + str(tau2_repo(config)), + "--run-dir", + str(output_dir(config, configured_run_id) / "memory_cells" / run_label), + "--run-label", + run_label, + "--strategy-id", + strategy["id"], "--domain", domain, - "--agent", - str(benchmark.get("agent", "llm_agent")), - "--user", - str(benchmark.get("user", "user_simulator")), - "--task-split-name", + "--eval-split-name", str(benchmark.get("eval_split_name", "test")), - "--num-trials", - "1", "--max-steps", str(benchmark.get("max_steps", 200)), "--max-concurrency", str(benchmark.get("task_max_concurrency", 10)), + "--base-agent", + str(benchmark.get("agent", "llm_agent")), + "--user", + str(benchmark.get("user", "user_simulator")), "--agent-llm", str(model["agent_llm"]), "--user-llm", str(model["user_llm"]), - "--save-to", - run_label, + "--agent-llm-args", + agent_llm_args, + "--user-llm-args", + user_llm_args, "--seed", str(seed), + "--no-memory", ] - command.extend(["--agent-llm-args", agent_llm_args]) - command.extend(["--user-llm-args", user_llm_args]) - if task_ids: - command.append("--task-ids") - command.extend(task_ids) + for task_id in task_ids: + command.extend(["--task-id", task_id]) elif num_tasks is not None: command.extend(["--num-tasks", str(num_tasks)]) @@ -295,24 +302,25 @@ def _build_plan( def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, str]: - if cell.get("memory_backend") == "openviking": + if cell.get("memory_backend") in {"openviking", "none"}: run_dir = out / "memory_cells" / cell["run_label"] - corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) - corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" - return { + artifacts = { "summary": str(run_dir / f"{cell['run_label']}.summary.json"), "results": str(run_dir / f"{cell['run_label']}.json"), - "retrieval_trace": str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl"), - "corpus_manifest": str(corpus_dir / "corpus_manifest.json"), } + if cell.get("memory_backend") == "none": + return artifacts + corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) + corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" + artifacts["retrieval_trace"] = str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl") + artifacts["corpus_manifest"] = str(corpus_dir / "corpus_manifest.json") + return artifacts return {"results": str(repo / "data" / "simulations" / f"{cell['run_label']}.json")} def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None: - if cell.get("memory_backend") == "openviking": - summary_path = Path(artifacts["summary"]) - if not summary_path.is_file(): - return None + summary_path = Path(artifacts.get("summary", "")) + if summary_path.is_file(): summary = json.loads(summary_path.read_text(encoding="utf-8")) return summary.get("metrics") @@ -327,6 +335,17 @@ def _memory_corpus_key(cell: dict[str, Any]) -> str: return f"{cell['domain']}_{corpus_id}" +def _tau2_subprocess_env(repo: Path) -> dict[str, str]: + env = os.environ.copy() + src = repo / "src" + pythonpath_entry = str(src if src.is_dir() else repo) + existing = env.get("PYTHONPATH") + env["PYTHONPATH"] = ( + pythonpath_entry if not existing else f"{pythonpath_entry}{os.pathsep}{existing}" + ) + return env + + def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, Any]: key = _memory_corpus_key(cell) command = list(cell["command"]) + ["--prepare-corpus-only"] @@ -334,6 +353,7 @@ def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[ completed = subprocess.run( command, cwd=repo, + env=_tau2_subprocess_env(repo), text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -459,6 +479,7 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str completed = subprocess.run( cell["command"], cwd=repo, + env=_tau2_subprocess_env(repo), text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index c26c347ffa..f8835aace6 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -612,13 +612,13 @@ def main() -> int: parser.add_argument("--user-llm", required=True) parser.add_argument("--agent-llm-args", type=_json, default={}) parser.add_argument("--user-llm-args", type=_json, default={}) - parser.add_argument("--openviking-url", required=True) - parser.add_argument("--openviking-account", required=True) - parser.add_argument("--openviking-user", required=True) - parser.add_argument("--openviking-agent-id", required=True) + parser.add_argument("--openviking-url") + parser.add_argument("--openviking-account") + parser.add_argument("--openviking-user") + parser.add_argument("--openviking-agent-id") parser.add_argument("--openviking-timeout", type=float, default=600.0) parser.add_argument("--openviking-wait-timeout", type=int, default=600) - parser.add_argument("--search-uri", required=True) + parser.add_argument("--search-uri") parser.add_argument("--retrieval-top-k", type=int, default=4) parser.add_argument( "--retrieval-mode", @@ -627,8 +627,30 @@ def main() -> int: ) parser.add_argument("--force-train", action="store_true") parser.add_argument("--prepare-corpus-only", action="store_true") + parser.add_argument( + "--no-memory", + action="store_true", + help="Run the configured TAU-2 agent without OpenViking retrieval.", + ) args = parser.parse_args() normalize_litellm_env() + if not args.no_memory: + missing = [ + name + for name in ( + "openviking_url", + "openviking_account", + "openviking_user", + "openviking_agent_id", + "search_uri", + ) + if not getattr(args, name) + ] + if missing: + parser.error( + "OpenViking memory runs require: " + + ", ".join("--" + name.replace("_", "-") for name in missing) + ) args.tau2_repo = args.tau2_repo.resolve() args.run_dir = args.run_dir.resolve() @@ -643,6 +665,40 @@ def main() -> int: trace_path = args.run_dir / f"{args.run_label}.retrieval_trace.jsonl" summary_path = args.run_dir / f"{args.run_label}.summary.json" + if args.no_memory: + _run_tau2( + tau2_repo=args.tau2_repo, + domain=args.domain, + split=args.eval_split_name, + task_ids=args.task_ids, + num_tasks=args.num_tasks, + trials=1, + max_steps=args.max_steps, + max_concurrency=args.max_concurrency, + agent=args.base_agent, + user=args.user, + agent_llm=args.agent_llm, + user_llm=args.user_llm, + agent_llm_args=args.agent_llm_args, + user_llm_args=args.user_llm_args, + seed=args.seed, + save_to=eval_results, + ) + assert_tau2_results_complete( + json.loads(eval_results.read_text()), context=f"{args.domain} eval" + ) + summary = { + "run_label": args.run_label, + "domain": args.domain, + "strategy_id": args.strategy_id, + "seed": args.seed, + "eval_results": str(eval_results), + "metrics": _metrics(eval_results), + } + _write_json(summary_path, summary) + print(json.dumps(summary, ensure_ascii=False, sort_keys=True)) + return 0 + corpus = _train(args, train_results, corpus_manifest) if args.prepare_corpus_only: print( From 15f66e9c6c1fb759f2b88bf3d55caa63ac42835b Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 06:12:57 +0800 Subject: [PATCH 27/42] bench(tau2): align corpus probe width with rerank --- benchmark/tau2/README.md | 6 +++-- benchmark/tau2/scripts/run_memory_v2_eval.py | 13 +++++++++-- tests/benchmark/test_tau2_category_rerank.py | 23 ++++++++++++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 36b2431019..ea1c32ffed 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -180,8 +180,10 @@ those diagnostic cells from the main reward/DB aggregates while preserving their metrics and artifacts for debugging. Corpus manifests also include `corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so aggregate-only corpora can be spotted before reading the eval trace; category -runs whose corpus probe has matches but no concrete matches are also marked -diagnostic. +runs whose corpus probe is empty, or has matches but no concrete matches, are +also marked diagnostic. The corpus probe uses the category `retrieve_limit` +when category rerank is enabled, so the probe width matches the runtime +pre-write search width. ## User Simulator Policy diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 72da77a21f..b06c9380aa 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -354,6 +354,8 @@ def _runtime_evidence_status( reasons: list[str] = [] if category_rerank.get("enabled"): corpus_probe = corpus_probe if isinstance(corpus_probe, dict) else {} + if corpus_probe and int(corpus_probe.get("match_count") or 0) <= 0: + reasons.append("empty_corpus_probe") if int(corpus_probe.get("match_count") or 0) > 0: if int(corpus_probe.get("concrete_match_count") or 0) <= 0: reasons.append("no_concrete_corpus_probe_matches") @@ -554,14 +556,20 @@ def _read_memory_text(client: Any, match: Any) -> tuple[str, str | None]: def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]: + probe_limit = args.retrieval_top_k + if hasattr(args, "category_reranker"): + probe_limit = args.category_reranker.search_limit( + probe_limit, + decision_node="before_write_tool_call", + ) result = client.search( query=f"{args.domain} customer service order reservation booking cancellation exchange return update", target_uri=args.search_uri, - limit=args.retrieval_top_k, + limit=probe_limit, ) memories = list(getattr(result, "memories", []) or []) reads = [] - for match in memories[: args.retrieval_top_k]: + for match in memories[:probe_limit]: uri = getattr(match, "uri", "") text, read_error = _read_memory_text(client, match) is_aggregate = _is_aggregate_memory_uri(uri) @@ -580,6 +588,7 @@ def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]: concrete_match_count = sum(1 for row in reads if row["is_concrete_memory"]) return { "query": f"{args.domain} customer service order reservation booking cancellation exchange return update", + "probe_limit": probe_limit, "match_count": len(memories), "aggregate_match_count": aggregate_match_count, "concrete_match_count": concrete_match_count, diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 803843957a..79b2cbce6c 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -258,9 +258,24 @@ def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: assert domain["db_match_rate"] == 0.0 +def test_runtime_evidence_marks_empty_corpus_probe_diagnostic() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={"match_count": 0}, + retrieval_trace_summary={"trace_present": True, "counts": {}, "rates": {}}, + ) + + assert evidence["status"] == "diagnostic" + assert "empty_corpus_probe" in evidence["reasons"] + + def test_probe_corpus_counts_aggregate_and_concrete_matches() -> None: class FakeClient: - def search(self, **_: object) -> SimpleNamespace: + def __init__(self) -> None: + self.limit: int | None = None + + def search(self, **kwargs: object) -> SimpleNamespace: + self.limit = int(kwargs["limit"]) return SimpleNamespace( memories=[ SimpleNamespace( @@ -277,15 +292,19 @@ def search(self, **_: object) -> SimpleNamespace: def read(self, uri: str) -> str: return f"body for {uri}" + client = FakeClient() probe = _probe_corpus( SimpleNamespace( + category_reranker=_reranker(), domain="airline", search_uri="viking://agent/a/memories/trajectories", retrieval_top_k=4, ), - FakeClient(), + client, ) + assert client.limit == 6 + assert probe["probe_limit"] == 6 assert probe["match_count"] == 2 assert probe["aggregate_match_count"] == 1 assert probe["concrete_match_count"] == 1 From 88505d7b02299c2e4bd552d21d55d92bd53ac9ae Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 06:57:21 +0800 Subject: [PATCH 28/42] bench(tau2): align no-memory runner with PR-B --- benchmark/tau2/README.md | 5 ++ benchmark/tau2/config/no_memory.yaml | 9 +++ benchmark/tau2/scripts/run_eval.py | 73 +++++++++++------ benchmark/tau2/scripts/run_memory_v2_eval.py | 66 +++++++++++++-- tests/benchmark/test_tau2_category_rerank.py | 84 +++++++++++++++++++- 5 files changed, 205 insertions(+), 32 deletions(-) create mode 100644 benchmark/tau2/config/no_memory.yaml diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index ea1c32ffed..0103eb94ee 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -18,6 +18,7 @@ benchmark/tau2/ ├── config/ │ ├── baseline.yaml │ ├── category_rerank.yaml +│ ├── no_memory.yaml │ ├── official.yaml │ ├── prewrite.yaml │ └── trajectory.yaml @@ -55,6 +56,10 @@ Plan the default benchmark without running TAU-2: python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only ``` +Use `config/no_memory.yaml` for same-runner no-memory baselines; it executes +through the Python wrapper so artifacts and result validation match the memory +cells. + Add `--preflight` or `--strict-preflight` when you want the runner to write a small environment/config check next to the run plan. diff --git a/benchmark/tau2/config/no_memory.yaml b/benchmark/tau2/config/no_memory.yaml new file mode 100644 index 0000000000..93f35633b4 --- /dev/null +++ b/benchmark/tau2/config/no_memory.yaml @@ -0,0 +1,9 @@ +extends: baseline.yaml + +benchmark: + name: tau2_openviking_no_memory + +strategies: + - id: no_memory + label: TAU-2 no-memory baseline + memory_backend: none diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 763c8c2ba2..ed26def21f 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -4,6 +4,7 @@ import argparse import importlib.util import json +import os import subprocess import sys from concurrent.futures import ThreadPoolExecutor, as_completed @@ -204,38 +205,44 @@ def _tau2_command( return None command = [ - tau2_cli(config), - "run", + sys.executable, + str(Path(__file__).with_name("run_memory_v2_eval.py")), + "--tau2-repo", + str(tau2_repo(config)), + "--run-dir", + str(output_dir(config, configured_run_id) / "memory_cells" / run_label), + "--run-label", + run_label, + "--strategy-id", + strategy["id"], "--domain", domain, - "--agent", - str(benchmark.get("agent", "llm_agent")), - "--user", - str(benchmark.get("user", "user_simulator")), - "--task-split-name", + "--eval-split-name", str(benchmark.get("eval_split_name", "test")), - "--num-trials", - "1", "--max-steps", str(benchmark.get("max_steps", 200)), "--max-concurrency", str(benchmark.get("task_max_concurrency", 10)), + "--base-agent", + str(benchmark.get("agent", "llm_agent")), + "--user", + str(benchmark.get("user", "user_simulator")), "--agent-llm", str(model["agent_llm"]), "--user-llm", str(model["user_llm"]), - "--save-to", - run_label, + "--agent-llm-args", + agent_llm_args, + "--user-llm-args", + user_llm_args, "--seed", str(seed), + "--no-memory", ] - command.extend(["--agent-llm-args", agent_llm_args]) - command.extend(["--user-llm-args", user_llm_args]) - if task_ids: - command.append("--task-ids") - command.extend(task_ids) + for task_id in task_ids: + command.extend(["--task-id", task_id]) elif num_tasks is not None: command.extend(["--num-tasks", str(num_tasks)]) @@ -336,24 +343,25 @@ def _build_plan( def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, str]: - if cell.get("memory_backend") == "openviking": + if cell.get("memory_backend") in {"openviking", "none"}: run_dir = out / "memory_cells" / cell["run_label"] - corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) - corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" - return { + artifacts = { "summary": str(run_dir / f"{cell['run_label']}.summary.json"), "results": str(run_dir / f"{cell['run_label']}.json"), - "retrieval_trace": str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl"), - "corpus_manifest": str(corpus_dir / "corpus_manifest.json"), } + if cell.get("memory_backend") == "none": + return artifacts + corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) + corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" + artifacts["retrieval_trace"] = str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl") + artifacts["corpus_manifest"] = str(corpus_dir / "corpus_manifest.json") + return artifacts return {"results": str(repo / "data" / "simulations" / f"{cell['run_label']}.json")} def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None: - if cell.get("memory_backend") == "openviking": - summary_path = Path(artifacts["summary"]) - if not summary_path.is_file(): - return None + summary_path = Path(artifacts.get("summary", "")) + if summary_path.is_file(): summary = json.loads(summary_path.read_text(encoding="utf-8")) return summary.get("metrics") @@ -390,6 +398,17 @@ def _memory_corpus_key(cell: dict[str, Any]) -> str: return f"{cell['domain']}_{corpus_id}" +def _tau2_subprocess_env(repo: Path) -> dict[str, str]: + env = os.environ.copy() + src = repo / "src" + pythonpath_entry = str(src if src.is_dir() else repo) + existing = env.get("PYTHONPATH") + env["PYTHONPATH"] = ( + pythonpath_entry if not existing else f"{pythonpath_entry}{os.pathsep}{existing}" + ) + return env + + def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, Any]: key = _memory_corpus_key(cell) command = list(cell["command"]) + ["--prepare-corpus-only"] @@ -397,6 +416,7 @@ def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[ completed = subprocess.run( command, cwd=repo, + env=_tau2_subprocess_env(repo), text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -532,6 +552,7 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str completed = subprocess.run( cell["command"], cwd=repo, + env=_tau2_subprocess_env(repo), text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index b06c9380aa..5096bb4237 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -959,13 +959,13 @@ def main() -> int: parser.add_argument("--user-llm", required=True) parser.add_argument("--agent-llm-args", type=_json, default={}) parser.add_argument("--user-llm-args", type=_json, default={}) - parser.add_argument("--openviking-url", required=True) - parser.add_argument("--openviking-account", required=True) - parser.add_argument("--openviking-user", required=True) - parser.add_argument("--openviking-agent-id", required=True) + parser.add_argument("--openviking-url") + parser.add_argument("--openviking-account") + parser.add_argument("--openviking-user") + parser.add_argument("--openviking-agent-id") parser.add_argument("--openviking-timeout", type=float, default=600.0) parser.add_argument("--openviking-wait-timeout", type=int, default=600) - parser.add_argument("--search-uri", required=True) + parser.add_argument("--search-uri") parser.add_argument("--retrieval-top-k", type=int, default=4) parser.add_argument( "--retrieval-mode", @@ -976,8 +976,30 @@ def main() -> int: parser.add_argument("--scope-prompt-config", type=_json, default={}) parser.add_argument("--force-train", action="store_true") parser.add_argument("--prepare-corpus-only", action="store_true") + parser.add_argument( + "--no-memory", + action="store_true", + help="Run the configured TAU-2 agent without OpenViking retrieval.", + ) args = parser.parse_args() normalize_litellm_env() + if not args.no_memory: + missing = [ + name + for name in ( + "openviking_url", + "openviking_account", + "openviking_user", + "openviking_agent_id", + "search_uri", + ) + if not getattr(args, name) + ] + if missing: + parser.error( + "OpenViking memory runs require: " + + ", ".join("--" + name.replace("_", "-") for name in missing) + ) args.category_reranker = CategoryReranker.from_payload( args.category_rerank_config, repo_root=REPO_ROOT, @@ -1001,6 +1023,40 @@ def main() -> int: trace_path = args.run_dir / f"{args.run_label}.retrieval_trace.jsonl" summary_path = args.run_dir / f"{args.run_label}.summary.json" + if args.no_memory: + _run_tau2( + tau2_repo=args.tau2_repo, + domain=args.domain, + split=args.eval_split_name, + task_ids=args.task_ids, + num_tasks=args.num_tasks, + trials=1, + max_steps=args.max_steps, + max_concurrency=args.max_concurrency, + agent=args.base_agent, + user=args.user, + agent_llm=args.agent_llm, + user_llm=args.user_llm, + agent_llm_args=args.agent_llm_args, + user_llm_args=args.user_llm_args, + seed=args.seed, + save_to=eval_results, + ) + assert_tau2_results_complete( + json.loads(eval_results.read_text()), context=f"{args.domain} eval" + ) + summary = { + "run_label": args.run_label, + "domain": args.domain, + "strategy_id": args.strategy_id, + "seed": args.seed, + "eval_results": str(eval_results), + "metrics": _metrics(eval_results), + } + _write_json(summary_path, summary) + print(json.dumps(summary, ensure_ascii=False, sort_keys=True)) + return 0 + corpus = _train(args, train_results, corpus_manifest) if args.prepare_corpus_only: print( diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 79b2cbce6c..e54408427d 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -3,7 +3,12 @@ from types import SimpleNamespace from benchmark.tau2.scripts.category_rerank import CategoryReranker -from benchmark.tau2.scripts.run_eval import _summarize +from benchmark.tau2.scripts.run_eval import ( + _cell_artifacts, + _cell_metrics, + _summarize, + _tau2_command, +) from benchmark.tau2.scripts.run_memory_v2_eval import ( _load_scope_prompt, _probe_corpus, @@ -258,6 +263,83 @@ def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: assert domain["db_match_rate"] == 0.0 +def test_no_memory_strategy_uses_wrapper_command(tmp_path: Path) -> None: + config = { + "benchmark": { + "eval_split_name": "test", + "max_steps": 7, + "task_max_concurrency": 2, + "agent": "llm_agent", + "user": "user_simulator", + "reasoning_effort": "high", + }, + "model": { + "agent_llm": "agent-model", + "user_llm": "user-model", + }, + "paths": { + "tau2_repo": str(tmp_path / "tau2-bench"), + "output_dir": str(tmp_path / "result"), + }, + } + + command = _tau2_command( + config, + domain="airline", + strategy={"id": "no_memory", "memory_backend": "none"}, + configured_run_id="baseline_run", + run_label="baseline_run_airline_no_memory_r1", + task_ids=["18"], + num_tasks=None, + train_num_tasks=None, + seed=300, + ) + + assert command is not None + assert command[1].endswith("run_memory_v2_eval.py") + assert "--no-memory" in command + assert "--openviking-url" not in command + assert command[command.index("--strategy-id") + 1] == "no_memory" + assert command[command.index("--base-agent") + 1] == "llm_agent" + assert command[command.index("--task-id") + 1] == "18" + assert command[command.index("--run-dir") + 1].endswith( + "result/baseline_run/memory_cells/baseline_run_airline_no_memory_r1" + ) + + +def test_no_memory_artifacts_read_wrapper_summary_metrics(tmp_path: Path) -> None: + out = tmp_path / "out" + cell = { + "memory_backend": "none", + "domain": "airline", + "strategy_id": "no_memory", + "run_label": "baseline_run_airline_no_memory_r1", + } + + artifacts = _cell_artifacts(cell, repo=tmp_path / "tau2-bench", out=out) + assert set(artifacts) == {"summary", "results"} + + summary_path = Path(artifacts["summary"]) + summary_path.parent.mkdir(parents=True) + summary_path.write_text( + json.dumps( + { + "metrics": { + "simulation_count": 1, + "avg_reward": 0.0, + "db_match_rate": 0.0, + } + } + ) + ) + + assert _cell_metrics(cell, artifacts) == { + "simulation_count": 1, + "avg_reward": 0.0, + "db_match_rate": 0.0, + } + + def test_runtime_evidence_marks_empty_corpus_probe_diagnostic() -> None: evidence = _runtime_evidence_status( category_rerank={"enabled": True}, From 1977673c2503a8c90eed64a20501c8ef68a571cb Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 07:16:59 +0800 Subject: [PATCH 29/42] test(tau2): guard S89 category alignment --- tests/benchmark/test_tau2_category_rerank.py | 48 ++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index e54408427d..0cae91de52 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -9,6 +9,7 @@ _summarize, _tau2_command, ) +from benchmark.tau2.scripts.tau2_common import load_config from benchmark.tau2.scripts.run_memory_v2_eval import ( _load_scope_prompt, _probe_corpus, @@ -34,6 +35,53 @@ def _reranker() -> CategoryReranker: ) +def _has_key_fragment(value: object, fragment: str) -> bool: + if isinstance(value, dict): + return any( + fragment in str(key).lower() or _has_key_fragment(item, fragment) + for key, item in value.items() + ) + if isinstance(value, list): + return any(_has_key_fragment(item, fragment) for item in value) + return False + + +def test_category_rerank_config_matches_s89_alignment_shape() -> None: + repo_root = Path(__file__).resolve().parents[2] + config = load_config(repo_root / "benchmark/tau2/config/category_rerank.yaml") + strategies = {row["id"]: row for row in config["strategies"]} + category_strategy = strategies["memory_v2_trajectory_category_prewrite"] + + assert config["benchmark"]["reasoning_effort"] == "high" + assert config["openviking"]["retrieval_top_k"] == 4 + assert category_strategy["memory_backend"] == "openviking" + assert category_strategy["train_memory_mode"] == "experience_only" + assert category_strategy["search_memory_type"] == "trajectories" + assert category_strategy["retrieval_mode"] == "first_user_prewrite" + assert category_strategy["corpus_id"] == "memory_v2_trajectory_view" + + category_rerank = category_strategy["category_rerank"] + assert category_rerank["enabled"] is True + assert category_rerank["apply_nodes"] == ["before_write_tool_call"] + assert category_rerank["retrieve_limit"] == 6 + assert category_rerank["inject_limit"] == 2 + assert category_rerank["mismatch_policy"] == "keep_positive_match_drop_mismatch" + assert category_rerank["positive_match_required"] is True + assert category_rerank["no_match_policy"] == "skip_injection" + assert category_rerank["search_score_weight"] == 0.0 + + scope_prompt = category_strategy["scope_prompt"] + assert scope_prompt["enabled"] is True + assert scope_prompt["injection_point"] == "system_prompt" + assert scope_prompt["domain_files"] == { + "retail": "benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md" + } + + assert "memory_v2_trajectory_prewrite" in strategies + assert not _has_key_fragment(category_strategy, "annotation") + assert not _has_key_fragment(category_strategy, "sidecar") + + def test_category_rerank_keeps_positive_category_match() -> None: rows = [ { From b4e153117b39c2491551f52c9f8aaaf3b5a8b78f Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 07:46:53 +0800 Subject: [PATCH 30/42] bench(tau2): require applied category runtime evidence --- benchmark/tau2/README.md | 7 +- benchmark/tau2/scripts/run_memory_v2_eval.py | 12 ++- tests/benchmark/test_tau2_category_rerank.py | 81 ++++++++++++++++++++ 3 files changed, 96 insertions(+), 4 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 0103eb94ee..960e7d6603 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -179,9 +179,10 @@ category coverage, aggregate-vs-concrete memory candidate coverage, and write tool calls. Use it as the first check that a run is using this branch's self-generated category signal before opening the JSONL trace. Category runs whose runtime trace has only aggregate `.overview.md` / `.abstract.md` -candidates, no memory category coverage, or no selected positive category match -are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes -those diagnostic cells from the main reward/DB aggregates while preserving their +candidates, no applied category-rerank event, no query or memory category +coverage, or no selected positive category match are marked +`runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those +diagnostic cells from the main reward/DB aggregates while preserving their metrics and artifacts for debugging. Corpus manifests also include `corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so aggregate-only corpora can be spotted before reading the eval trace; category diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 5096bb4237..0619c52efd 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -354,6 +354,8 @@ def _runtime_evidence_status( reasons: list[str] = [] if category_rerank.get("enabled"): corpus_probe = corpus_probe if isinstance(corpus_probe, dict) else {} + if not corpus_probe: + reasons.append("missing_corpus_probe") if corpus_probe and int(corpus_probe.get("match_count") or 0) <= 0: reasons.append("empty_corpus_probe") if int(corpus_probe.get("match_count") or 0) > 0: @@ -377,7 +379,15 @@ def _runtime_evidence_status( if isinstance(retrieval_trace_summary.get("rates"), dict) else {} ) - if int(counts.get("category_applied_event_count") or 0) > 0: + applied_count = int(counts.get("category_applied_event_count") or 0) + if retrieval_trace_summary.get("trace_present"): + if int(retrieval_trace_summary.get("category_event_count") or 0) <= 0: + reasons.append("no_category_rerank_events") + elif applied_count <= 0: + reasons.append("no_category_rerank_applied_events") + if applied_count > 0: + if int(counts.get("query_category_matched_event_count") or 0) <= 0: + reasons.append("no_query_category_coverage") if float(rates.get("concrete_memory_candidate_rate") or 0.0) <= 0.0: reasons.append("no_concrete_memory_candidates") if int(counts.get("memory_category_present_count") or 0) <= 0: diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 0cae91de52..9b072f87f2 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -270,6 +270,87 @@ def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: assert "no_selected_positive_category_match" in evidence["reasons"] +def test_runtime_evidence_requires_applied_category_events() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 1, + "aggregate_match_count": 0, + "concrete_match_count": 1, + }, + retrieval_trace_summary={ + "trace_present": True, + "category_event_count": 1, + "counts": { + "category_enabled_event_count": 1, + "category_applied_event_count": 0, + }, + "rates": { + "concrete_memory_candidate_rate": 1.0, + "selected_positive_category_match_rate": 1.0, + }, + }, + ) + + assert evidence["status"] == "diagnostic" + assert "no_category_rerank_applied_events" in evidence["reasons"] + + +def test_runtime_evidence_requires_query_category_coverage() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 1, + "aggregate_match_count": 0, + "concrete_match_count": 1, + }, + retrieval_trace_summary={ + "trace_present": True, + "category_event_count": 1, + "counts": { + "category_applied_event_count": 1, + "query_category_matched_event_count": 0, + "memory_category_present_count": 1, + "memory_category_matched_count": 1, + }, + "rates": { + "concrete_memory_candidate_rate": 1.0, + "selected_positive_category_match_rate": 1.0, + }, + }, + ) + + assert evidence["status"] == "diagnostic" + assert "no_query_category_coverage" in evidence["reasons"] + + +def test_runtime_evidence_accepts_valid_category_runtime_coverage() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 2, + "aggregate_match_count": 0, + "concrete_match_count": 2, + }, + retrieval_trace_summary={ + "trace_present": True, + "category_event_count": 1, + "counts": { + "category_applied_event_count": 1, + "query_category_matched_event_count": 1, + "memory_category_present_count": 2, + "memory_category_matched_count": 1, + }, + "rates": { + "concrete_memory_candidate_rate": 1.0, + "selected_positive_category_match_rate": 1.0, + }, + }, + ) + + assert evidence == {"status": "valid", "reasons": []} + + def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: scoreboard = _summarize( [ From 3be6924aa578ed57246376e1833e1cd2c93277c8 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 08:18:10 +0800 Subject: [PATCH 31/42] bench(tau2): summarize diagnostic evidence reasons --- benchmark/tau2/README.md | 3 ++- benchmark/tau2/scripts/run_eval.py | 11 +++++++++++ tests/benchmark/test_tau2_category_rerank.py | 9 ++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 960e7d6603..038f033d17 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -183,7 +183,8 @@ candidates, no applied category-rerank event, no query or memory category coverage, or no selected positive category match are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those diagnostic cells from the main reward/DB aggregates while preserving their -metrics and artifacts for debugging. Corpus manifests also include +metrics, artifacts, and `diagnostic_reason_counts` for debugging. Corpus +manifests also include `corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so aggregate-only corpora can be spotted before reading the eval trace; category runs whose corpus probe is empty, or has matches but no concrete matches, are diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index ed26def21f..9517b8a44b 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -7,6 +7,7 @@ import os import subprocess import sys +from collections import Counter from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Any @@ -473,6 +474,15 @@ def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: diagnostic_rows = [ row for row in metric_rows if not _row_is_valid_evidence(row) ] + diagnostic_reason_counts: Counter[str] = Counter() + for row in diagnostic_rows: + evidence = row.get("runtime_evidence") + evidence = evidence if isinstance(evidence, dict) else {} + reasons = list(evidence.get("reasons") or []) + if not reasons: + reasons = [str(evidence.get("status") or "diagnostic")] + for reason in reasons: + diagnostic_reason_counts[str(reason)] += 1 sim_count = sum(int(row["metrics"].get("simulation_count") or 0) for row in metric_rows) valid_sim_count = sum( int(row["metrics"].get("simulation_count") or 0) for row in valid_metric_rows @@ -501,6 +511,7 @@ def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: "completed_cell_count": len(metric_rows), "valid_completed_cell_count": len(valid_metric_rows), "diagnostic_cell_count": len(diagnostic_rows), + "diagnostic_reason_counts": dict(sorted(diagnostic_reason_counts.items())), "diagnostic_simulation_count": sim_count - valid_sim_count, "simulation_count": valid_sim_count, "avg_reward": reward_sum / valid_sim_count if valid_sim_count else None, diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 9b072f87f2..6cded8cb3e 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -364,7 +364,10 @@ def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: }, "runtime_evidence": { "status": "diagnostic", - "reasons": ["no_concrete_memory_candidates"], + "reasons": [ + "no_concrete_memory_candidates", + "no_query_category_coverage", + ], }, }, { @@ -386,6 +389,10 @@ def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: assert domain["completed_cell_count"] == 2 assert domain["valid_completed_cell_count"] == 1 assert domain["diagnostic_cell_count"] == 1 + assert domain["diagnostic_reason_counts"] == { + "no_concrete_memory_candidates": 1, + "no_query_category_coverage": 1, + } assert domain["diagnostic_simulation_count"] == 1 assert domain["simulation_count"] == 1 assert domain["avg_reward"] == 0.5 From 7a8f078cb93526db5b047eccae4ee538d1368970 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 08:52:41 +0800 Subject: [PATCH 32/42] bench(tau2): distinguish category coverage from match --- benchmark/tau2/README.md | 13 +++++++------ benchmark/tau2/scripts/run_memory_v2_eval.py | 7 +++++-- tests/benchmark/test_tau2_category_rerank.py | 18 +++++++++++++++--- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 038f033d17..6921bb30a0 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -175,12 +175,13 @@ categories, rerank reasons, selected rows, skipped rows, scope prompt metadata, and flat `*_category*_prompt` fields kept compatible with Harness diagnostics. Each run summary also includes `retrieval_trace_summary`, a compact rollup of decision nodes, category decisions, query/memory category sources, selected -category coverage, aggregate-vs-concrete memory candidate coverage, and write -tool calls. Use it as the first check that a run is using this branch's -self-generated category signal before opening the JSONL trace. Category runs -whose runtime trace has only aggregate `.overview.md` / `.abstract.md` -candidates, no applied category-rerank event, no query or memory category -coverage, or no selected positive category match are marked +category coverage, positive query-to-memory category-match coverage, +aggregate-vs-concrete memory candidate coverage, and write tool calls. Use it +as the first check that a run is using this branch's self-generated category +signal before opening the JSONL trace. Category runs whose runtime trace has +only aggregate `.overview.md` / `.abstract.md` candidates, no applied +category-rerank event, no query or memory category coverage, no positive +query-to-memory category match, or no selected positive category match are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those diagnostic cells from the main reward/DB aggregates while preserving their metrics, artifacts, and `diagnostic_reason_counts` for debugging. Corpus diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 0619c52efd..c9a7d0f5c2 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -272,19 +272,22 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: if selected: counters["selected_concrete_memory_count"] += 1 memory_source = match.get("memory_category_source_prompt") + positive_category_match = bool( + match.get("category1_match") or match.get("category2_match") + ) if memory_source: counters["memory_category_present_count"] += 1 memory_category_sources[str(memory_source)] += 1 if selected: counters["selected_memory_category_present_count"] += 1 selected_memory_category_sources[str(memory_source)] += 1 - if match.get("memory_category1_prompt") or match.get("memory_category2_prompt"): + if positive_category_match: counters["memory_category_matched_count"] += 1 if selected: counters["selected_memory_category_matched_count"] += 1 elif match.get("category_rerank_reasons") is not None: counters["memory_category_missing_count"] += 1 - if match.get("category1_match") or match.get("category2_match"): + if positive_category_match: counters["positive_category_match_count"] += 1 if selected: counters["selected_positive_category_match_count"] += 1 diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 6cded8cb3e..8655a1ba31 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -215,6 +215,15 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: "selected_for_injection": False, "category_rerank_reasons": ["missing_memory_category"], }, + { + "uri": "viking://agent/example/memories/trajectories/pending_cancel.md", + "selected_for_injection": False, + "memory_category_source_prompt": "tau2_category_catalog_keyword_match", + "memory_category1_prompt": ["retail_order_cancellation"], + "memory_category2_prompt": ["pending_order_cancel"], + "category1_match": False, + "category2_match": False, + }, ], }, ] @@ -230,12 +239,15 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: assert summary["query_category_sources"]["tau2_category_catalog_keyword_match"] == 1 assert summary["selected_memory_category_sources"]["tau2_category_catalog_keyword_match"] == 1 assert summary["tool_calls"]["exchange_delivered_order_items"] == 1 + assert summary["rates"]["memory_category_candidate_coverage"] == 2 / 3 assert summary["rates"]["selected_memory_category_coverage"] == 1.0 - assert summary["rates"]["memory_category_match_coverage"] == 0.5 + assert summary["rates"]["memory_category_match_coverage"] == 1 / 3 assert summary["rates"]["selected_memory_category_match_coverage"] == 1.0 assert summary["counts"]["aggregate_memory_candidate_count"] == 1 - assert summary["counts"]["concrete_memory_candidate_count"] == 1 - assert summary["rates"]["concrete_memory_candidate_rate"] == 0.5 + assert summary["counts"]["concrete_memory_candidate_count"] == 2 + assert summary["counts"]["memory_category_present_count"] == 2 + assert summary["counts"]["memory_category_matched_count"] == 1 + assert summary["rates"]["concrete_memory_candidate_rate"] == 2 / 3 assert summary["rates"]["selected_concrete_memory_rate"] == 0.0 From c5ac25d789eba23b2ccad6bfdf051faea2745b5e Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 09:51:22 +0800 Subject: [PATCH 33/42] bench(tau2): require concrete category injection evidence --- benchmark/tau2/README.md | 6 +- benchmark/tau2/scripts/run_memory_v2_eval.py | 43 ++++++++++ tests/benchmark/test_tau2_category_rerank.py | 84 +++++++++++++++++++- 3 files changed, 128 insertions(+), 5 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 6921bb30a0..8e8e0eb89a 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -181,10 +181,12 @@ as the first check that a run is using this branch's self-generated category signal before opening the JSONL trace. Category runs whose runtime trace has only aggregate `.overview.md` / `.abstract.md` candidates, no applied category-rerank event, no query or memory category coverage, no positive -query-to-memory category match, or no selected positive category match are marked +query-to-memory category match, no actual memory injection, no injected +concrete memory, or no selected positive category match are marked `runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those diagnostic cells from the main reward/DB aggregates while preserving their -metrics, artifacts, and `diagnostic_reason_counts` for debugging. Corpus +metrics, artifacts, and +`diagnostic_reason_counts` for debugging. Corpus manifests also include `corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so aggregate-only corpora can be spotted before reading the eval trace; category diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index c9a7d0f5c2..e02c994c2b 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -233,6 +233,12 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: continue decision_nodes[str(row.get("decision_node") or "unknown")] += 1 + injected_count = int(row.get("injected_count") or 0) + if str(row.get("retrieval_action_taken") or "") == "retrieve_and_inject" and ( + row.get("injected") or injected_count > 0 + ): + counters["memory_injection_event_count"] += 1 + counters["memory_injected_count"] += injected_count for call in row.get("tool_calls") or []: if isinstance(call, dict) and call.get("name"): tool_calls[str(call["name"])] += 1 @@ -261,8 +267,11 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: continue counters["raw_match_count"] += 1 selected = bool(match.get("selected_for_injection") or match.get("injected")) + injected = bool(match.get("injected")) if selected: counters["selected_match_count"] += 1 + if injected: + counters["injected_match_count"] += 1 if _is_aggregate_memory_uri(match.get("uri")): counters["aggregate_memory_candidate_count"] += 1 if selected: @@ -271,6 +280,8 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: counters["concrete_memory_candidate_count"] += 1 if selected: counters["selected_concrete_memory_count"] += 1 + if injected: + counters["injected_concrete_memory_count"] += 1 memory_source = match.get("memory_category_source_prompt") positive_category_match = bool( match.get("category1_match") or match.get("category2_match") @@ -291,16 +302,24 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: counters["positive_category_match_count"] += 1 if selected: counters["selected_positive_category_match_count"] += 1 + if injected: + counters["injected_positive_category_match_count"] += 1 raw_count = counters["raw_match_count"] selected_count = counters["selected_match_count"] + injected_count = counters["injected_match_count"] for key in [ "aggregate_memory_candidate_count", "concrete_memory_candidate_count", + "memory_injection_event_count", + "memory_injected_count", + "injected_match_count", "selected_aggregate_memory_count", "selected_concrete_memory_count", + "injected_concrete_memory_count", "memory_category_matched_count", "selected_memory_category_matched_count", + "injected_positive_category_match_count", ]: counters[key] += 0 return { @@ -336,6 +355,11 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: if selected_count else None ), + "injected_positive_category_match_rate": ( + counters["injected_positive_category_match_count"] / injected_count + if injected_count + else None + ), "concrete_memory_candidate_rate": ( counters["concrete_memory_candidate_count"] / raw_count if raw_count else None ), @@ -344,6 +368,11 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: if selected_count else None ), + "injected_concrete_memory_rate": ( + counters["injected_concrete_memory_count"] / injected_count + if injected_count + else None + ), }, } @@ -397,12 +426,26 @@ def _runtime_evidence_status( reasons.append("no_memory_category_coverage") if int(counts.get("memory_category_matched_count") or 0) <= 0: reasons.append("no_matched_memory_categories") + if int(counts.get("memory_injection_event_count") or 0) <= 0: + reasons.append("no_memory_injection") + if ( + int(counts.get("memory_injection_event_count") or 0) > 0 + and float(rates.get("injected_concrete_memory_rate") or 0.0) <= 0.0 + ): + reasons.append("no_injected_concrete_memory") if ( int(counts.get("query_category_matched_event_count") or 0) > 0 and float(rates.get("selected_positive_category_match_rate") or 0.0) <= 0.0 ): reasons.append("no_selected_positive_category_match") + if ( + int(counts.get("query_category_matched_event_count") or 0) > 0 + and int(counts.get("memory_injection_event_count") or 0) > 0 + and float(rates.get("injected_positive_category_match_rate") or 0.0) + <= 0.0 + ): + reasons.append("no_injected_positive_category_match") return { "status": "diagnostic" if reasons else "valid", diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 8655a1ba31..1b209fc2bf 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -191,6 +191,8 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: { "decision_node": "before_write_tool_call", "retrieval_action_taken": "retrieve_and_inject", + "injected": True, + "injected_count": 1, "tool_calls": [{"name": "exchange_delivered_order_items"}], "category_rerank": { "enabled": True, @@ -203,15 +205,16 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: }, "matches": [ { - "uri": "viking://agent/example/memories/trajectories/.overview.md", + "uri": "viking://agent/example/memories/trajectories/delivered_exchange.md", "selected_for_injection": True, + "injected": True, "memory_category_source_prompt": "tau2_category_catalog_keyword_match", "memory_category1_prompt": ["retail_order_post_shipment_service_request"], "memory_category2_prompt": ["delivered_order_exchange"], "category2_match": True, }, { - "uri": "viking://agent/example/memories/trajectories/concrete.md", + "uri": "viking://agent/example/memories/trajectories/.overview.md", "selected_for_injection": False, "category_rerank_reasons": ["missing_memory_category"], }, @@ -245,10 +248,17 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: assert summary["rates"]["selected_memory_category_match_coverage"] == 1.0 assert summary["counts"]["aggregate_memory_candidate_count"] == 1 assert summary["counts"]["concrete_memory_candidate_count"] == 2 + assert summary["counts"]["memory_injection_event_count"] == 1 + assert summary["counts"]["memory_injected_count"] == 1 + assert summary["counts"]["injected_match_count"] == 1 + assert summary["counts"]["injected_concrete_memory_count"] == 1 + assert summary["counts"]["injected_positive_category_match_count"] == 1 assert summary["counts"]["memory_category_present_count"] == 2 assert summary["counts"]["memory_category_matched_count"] == 1 assert summary["rates"]["concrete_memory_candidate_rate"] == 2 / 3 - assert summary["rates"]["selected_concrete_memory_rate"] == 0.0 + assert summary["rates"]["selected_concrete_memory_rate"] == 1.0 + assert summary["rates"]["injected_positive_category_match_rate"] == 1.0 + assert summary["rates"]["injected_concrete_memory_rate"] == 1.0 def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: @@ -279,6 +289,7 @@ def test_runtime_evidence_marks_aggregate_only_category_diagnostic() -> None: assert "no_concrete_corpus_probe_matches" in evidence["reasons"] assert "no_concrete_memory_candidates" in evidence["reasons"] assert "no_matched_memory_categories" in evidence["reasons"] + assert "no_memory_injection" in evidence["reasons"] assert "no_selected_positive_category_match" in evidence["reasons"] @@ -352,10 +363,15 @@ def test_runtime_evidence_accepts_valid_category_runtime_coverage() -> None: "query_category_matched_event_count": 1, "memory_category_present_count": 2, "memory_category_matched_count": 1, + "memory_injection_event_count": 1, + "injected_concrete_memory_count": 1, + "injected_positive_category_match_count": 1, }, "rates": { "concrete_memory_candidate_rate": 1.0, "selected_positive_category_match_rate": 1.0, + "injected_concrete_memory_rate": 1.0, + "injected_positive_category_match_rate": 1.0, }, }, ) @@ -363,6 +379,68 @@ def test_runtime_evidence_accepts_valid_category_runtime_coverage() -> None: assert evidence == {"status": "valid", "reasons": []} +def test_runtime_evidence_requires_actual_memory_injection() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 1, + "aggregate_match_count": 0, + "concrete_match_count": 1, + }, + retrieval_trace_summary={ + "trace_present": True, + "category_event_count": 1, + "counts": { + "category_applied_event_count": 1, + "query_category_matched_event_count": 1, + "memory_category_present_count": 1, + "memory_category_matched_count": 1, + "memory_injection_event_count": 0, + }, + "rates": { + "concrete_memory_candidate_rate": 1.0, + "selected_positive_category_match_rate": 1.0, + }, + }, + ) + + assert evidence["status"] == "diagnostic" + assert "no_memory_injection" in evidence["reasons"] + + +def test_runtime_evidence_requires_injected_concrete_memory() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 2, + "aggregate_match_count": 1, + "concrete_match_count": 1, + }, + retrieval_trace_summary={ + "trace_present": True, + "category_event_count": 1, + "counts": { + "category_applied_event_count": 1, + "query_category_matched_event_count": 1, + "memory_category_present_count": 1, + "memory_category_matched_count": 1, + "memory_injection_event_count": 1, + "injected_concrete_memory_count": 0, + "injected_positive_category_match_count": 1, + }, + "rates": { + "concrete_memory_candidate_rate": 0.5, + "selected_positive_category_match_rate": 1.0, + "injected_concrete_memory_rate": 0.0, + "injected_positive_category_match_rate": 1.0, + }, + }, + ) + + assert evidence["status"] == "diagnostic" + assert "no_injected_concrete_memory" in evidence["reasons"] + + def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: scoreboard = _summarize( [ From 7b6dc92dcd5314e1c33f1eba3065f5fe07bd8185 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 10:18:57 +0800 Subject: [PATCH 34/42] bench(tau2): require injected concrete category match --- benchmark/tau2/README.md | 8 ++--- benchmark/tau2/scripts/run_memory_v2_eval.py | 17 ++++++--- tests/benchmark/test_tau2_category_rerank.py | 38 ++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 8e8e0eb89a..3525daa471 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -182,10 +182,10 @@ signal before opening the JSONL trace. Category runs whose runtime trace has only aggregate `.overview.md` / `.abstract.md` candidates, no applied category-rerank event, no query or memory category coverage, no positive query-to-memory category match, no actual memory injection, no injected -concrete memory, or no selected positive category match are marked -`runtime_evidence.status=diagnostic`; `scoreboard.json` excludes those -diagnostic cells from the main reward/DB aggregates while preserving their -metrics, artifacts, and +concrete memory, no injected concrete positive category match, or no selected +positive category match are marked `runtime_evidence.status=diagnostic`; +`scoreboard.json` excludes those diagnostic cells from the main reward/DB +aggregates while preserving their metrics, artifacts, and `diagnostic_reason_counts` for debugging. Corpus manifests also include `corpus_probe.aggregate_match_count` and `corpus_probe.concrete_match_count` so diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index e02c994c2b..1de72cc067 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -272,7 +272,8 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: counters["selected_match_count"] += 1 if injected: counters["injected_match_count"] += 1 - if _is_aggregate_memory_uri(match.get("uri")): + is_aggregate = _is_aggregate_memory_uri(match.get("uri")) + if is_aggregate: counters["aggregate_memory_candidate_count"] += 1 if selected: counters["selected_aggregate_memory_count"] += 1 @@ -304,6 +305,8 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: counters["selected_positive_category_match_count"] += 1 if injected: counters["injected_positive_category_match_count"] += 1 + if not is_aggregate: + counters["injected_concrete_positive_category_match_count"] += 1 raw_count = counters["raw_match_count"] selected_count = counters["selected_match_count"] @@ -320,6 +323,7 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: "memory_category_matched_count", "selected_memory_category_matched_count", "injected_positive_category_match_count", + "injected_concrete_positive_category_match_count", ]: counters[key] += 0 return { @@ -360,6 +364,11 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: if injected_count else None ), + "injected_concrete_positive_category_match_rate": ( + counters["injected_concrete_positive_category_match_count"] / injected_count + if injected_count + else None + ), "concrete_memory_candidate_rate": ( counters["concrete_memory_candidate_count"] / raw_count if raw_count else None ), @@ -442,10 +451,10 @@ def _runtime_evidence_status( if ( int(counts.get("query_category_matched_event_count") or 0) > 0 and int(counts.get("memory_injection_event_count") or 0) > 0 - and float(rates.get("injected_positive_category_match_rate") or 0.0) - <= 0.0 + and int(counts.get("injected_concrete_positive_category_match_count") or 0) + <= 0 ): - reasons.append("no_injected_positive_category_match") + reasons.append("no_injected_concrete_positive_category_match") return { "status": "diagnostic" if reasons else "valid", diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 1b209fc2bf..052a7d639c 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -253,11 +253,13 @@ def test_trace_category_summary_counts_runtime_sources(tmp_path: Path) -> None: assert summary["counts"]["injected_match_count"] == 1 assert summary["counts"]["injected_concrete_memory_count"] == 1 assert summary["counts"]["injected_positive_category_match_count"] == 1 + assert summary["counts"]["injected_concrete_positive_category_match_count"] == 1 assert summary["counts"]["memory_category_present_count"] == 2 assert summary["counts"]["memory_category_matched_count"] == 1 assert summary["rates"]["concrete_memory_candidate_rate"] == 2 / 3 assert summary["rates"]["selected_concrete_memory_rate"] == 1.0 assert summary["rates"]["injected_positive_category_match_rate"] == 1.0 + assert summary["rates"]["injected_concrete_positive_category_match_rate"] == 1.0 assert summary["rates"]["injected_concrete_memory_rate"] == 1.0 @@ -366,6 +368,7 @@ def test_runtime_evidence_accepts_valid_category_runtime_coverage() -> None: "memory_injection_event_count": 1, "injected_concrete_memory_count": 1, "injected_positive_category_match_count": 1, + "injected_concrete_positive_category_match_count": 1, }, "rates": { "concrete_memory_candidate_rate": 1.0, @@ -441,6 +444,41 @@ def test_runtime_evidence_requires_injected_concrete_memory() -> None: assert "no_injected_concrete_memory" in evidence["reasons"] +def test_runtime_evidence_requires_injected_concrete_positive_category_match() -> None: + evidence = _runtime_evidence_status( + category_rerank={"enabled": True}, + corpus_probe={ + "match_count": 2, + "aggregate_match_count": 1, + "concrete_match_count": 1, + }, + retrieval_trace_summary={ + "trace_present": True, + "category_event_count": 1, + "counts": { + "category_applied_event_count": 1, + "query_category_matched_event_count": 1, + "memory_category_present_count": 1, + "memory_category_matched_count": 1, + "memory_injection_event_count": 2, + "injected_concrete_memory_count": 1, + "injected_positive_category_match_count": 1, + "injected_concrete_positive_category_match_count": 0, + }, + "rates": { + "concrete_memory_candidate_rate": 0.5, + "selected_positive_category_match_rate": 1.0, + "injected_concrete_memory_rate": 0.5, + "injected_positive_category_match_rate": 0.5, + "injected_concrete_positive_category_match_rate": 0.0, + }, + }, + ) + + assert evidence["status"] == "diagnostic" + assert "no_injected_concrete_positive_category_match" in evidence["reasons"] + + def test_scoreboard_excludes_diagnostic_runtime_evidence() -> None: scoreboard = _summarize( [ From 662cf0b387e7ba9b64eca6794368202215d18040 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 11:25:12 +0800 Subject: [PATCH 35/42] bench(tau2): align retrieval budget and fixed first user --- benchmark/tau2/README.md | 12 +- benchmark/tau2/config/baseline.yaml | 7 ++ benchmark/tau2/scripts/run_eval.py | 93 +++++++++++++- benchmark/tau2/scripts/run_memory_v2_eval.py | 123 +++++++++++++++++-- 4 files changed, 224 insertions(+), 11 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 6c542d9bcc..4bdc566bf1 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -128,7 +128,9 @@ Memory V2 cells run through a small TAU-2 agent adapter in this directory: - train by writing TAU-2 training conversations into OpenViking sessions; - evaluate by retrieving OpenViking memory at the first user turn; - for pre-write recall, retrieve again before write-like tool calls and - regenerate that step with the matched memories; + regenerate that step with the matched memories. The default benchmark + retrieves 6 pre-write candidates and injects 2, which keeps extra candidates + visible in traces without expanding the prompt budget; - emit artifact metadata to identify the OpenViking account, agent, corpus, retrieval mode, and simulator policy used by each cell. @@ -153,6 +155,14 @@ confirmation boundary to the TAU-2 user simulator guidelines; metadata such as the upstream PR link is kept in run artifacts, not in the simulator prompt. Reference: [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297). +Optional fixed-first-user fixtures keep the first simulated user turn stable +while preserving live simulator behavior after that turn: + +```bash +export TAU2_RETAIL_FIXED_FIRST_USER_FILE=/path/to/retail_fixture.json +export TAU2_AIRLINE_FIXED_FIRST_USER_FILE=/path/to/airline_fixture.json +``` + Use `config/official.yaml` with a clean TAU-2 checkout when you need an official-user-simulator parity run. If the checkout was already patched, the artifact records that boundary instead of labeling the run pure official. diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 95541dcbd6..00ec9dbd9c 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -24,6 +24,11 @@ eval: # memory benchmark config opts into a confirmation-aware TAU-2 user simulator # prompt; run_eval.py applies that small prompt patch idempotently when needed. user_simulator_policy: confirmation_aware + # Optional fixed-first-user fixtures keep the first simulated user turn stable + # while leaving later turns live. Set these env vars to fixture JSON files. + fixed_first_user_fixtures: + retail: ${TAU2_RETAIL_FIXED_FIRST_USER_FILE:-} + airline: ${TAU2_AIRLINE_FIXED_FIRST_USER_FILE:-} model: agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215} @@ -35,6 +40,8 @@ openviking: account: ${OPENVIKING_ACCOUNT:-default} agent_id: ${OPENVIKING_AGENT_ID:-tau2-openviking-agent} retrieval_top_k: 4 + prewrite_retrieval_top_k: 6 + prewrite_inject_top_k: 2 replay_write_policy: read_only strategies: diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 715c0d2556..5bd3037a03 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -18,6 +18,7 @@ normalize_litellm_env, output_dir, run_id, + resolve_path, simulator_policy_report, split_file, strategy_ids, @@ -49,6 +50,66 @@ def _db_match(sim: dict[str, Any]) -> bool | None: return sim.get("db_match") +def _strategy_int( + config: dict[str, Any], + strategy: dict[str, Any], + key: str, + *, + fallback_key: str | None = None, + default: int = 4, +) -> int: + openviking = config.get("openviking", {}) + value = strategy.get(key) + if value is None: + value = openviking.get(key) + if value is None and fallback_key: + value = strategy.get(fallback_key) + if value is None and fallback_key: + value = openviking.get(fallback_key) + if value is None: + value = default + return int(value) + + +def _retrieval_budget(config: dict[str, Any], strategy: dict[str, Any]) -> dict[str, int]: + retrieval_top_k = _strategy_int(config, strategy, "retrieval_top_k", default=4) + first_user_retrieval_top_k = _strategy_int( + config, + strategy, + "first_user_retrieval_top_k", + fallback_key="retrieval_top_k", + default=retrieval_top_k, + ) + first_user_inject_top_k = _strategy_int( + config, + strategy, + "first_user_inject_top_k", + fallback_key="first_user_retrieval_top_k", + default=first_user_retrieval_top_k, + ) + prewrite_retrieval_top_k = _strategy_int( + config, + strategy, + "prewrite_retrieval_top_k", + fallback_key="retrieval_top_k", + default=retrieval_top_k, + ) + prewrite_inject_top_k = _strategy_int( + config, + strategy, + "prewrite_inject_top_k", + fallback_key="prewrite_retrieval_top_k", + default=prewrite_retrieval_top_k, + ) + return { + "retrieval_top_k": retrieval_top_k, + "first_user_retrieval_top_k": first_user_retrieval_top_k, + "first_user_inject_top_k": first_user_inject_top_k, + "prewrite_retrieval_top_k": prewrite_retrieval_top_k, + "prewrite_inject_top_k": prewrite_inject_top_k, + } + + def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: data = json.loads(results_path.read_text(encoding="utf-8")) assert_tau2_results_complete(data, context=str(results_path)) @@ -86,6 +147,7 @@ def _tau2_command( if reasoning_effort: agent_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' user_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' + fixed_first_user_file = _fixed_first_user_file(config, domain) if ( strategy.get("memory_backend") == "openviking" @@ -102,6 +164,7 @@ def _tau2_command( f"Unsupported search_memory_type for {strategy['id']}: {search_memory_type}" ) search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" + budget = _retrieval_budget(config, strategy) command = [ sys.executable, str(Path(__file__).with_name("run_memory_v2_eval.py")), @@ -144,12 +207,22 @@ def _tau2_command( "--search-uri", search_uri, "--retrieval-top-k", - str(openviking.get("retrieval_top_k", 4)), + str(budget["retrieval_top_k"]), + "--first-user-retrieval-top-k", + str(budget["first_user_retrieval_top_k"]), + "--first-user-inject-top-k", + str(budget["first_user_inject_top_k"]), + "--prewrite-retrieval-top-k", + str(budget["prewrite_retrieval_top_k"]), + "--prewrite-inject-top-k", + str(budget["prewrite_inject_top_k"]), "--retrieval-mode", str(strategy.get("retrieval_mode", "first_user")), "--seed", str(seed), ] + if fixed_first_user_file is not None: + command.extend(["--fixed-first-user-file", str(fixed_first_user_file)]) if task_ids: for task_id in task_ids: command.extend(["--task-id", task_id]) @@ -200,6 +273,8 @@ def _tau2_command( str(seed), "--no-memory", ] + if fixed_first_user_file is not None: + command.extend(["--fixed-first-user-file", str(fixed_first_user_file)]) if task_ids: for task_id in task_ids: @@ -210,6 +285,17 @@ def _tau2_command( return command +def _fixed_first_user_file(config: dict[str, Any], domain: str) -> Path | None: + raw = config.get("eval", {}).get("fixed_first_user_fixture") + if raw is None: + raw = config.get("eval", {}).get("fixed_first_user_fixtures") + if isinstance(raw, dict): + raw = raw.get(domain) or raw.get("default") + if raw is None or str(raw).strip() == "": + return None + return resolve_path(str(raw)) + + def _build_plan( config: dict[str, Any], configured_run_id: str, @@ -256,6 +342,7 @@ def _build_plan( train_num_tasks=train_num_tasks, seed=seed, ) + fixed_first_user_file = _fixed_first_user_file(config, domain) non_executable_reason = None if command is None: non_executable_reason = ( @@ -274,11 +361,15 @@ def _build_plan( "memory_backend": strategy.get("memory_backend"), "corpus_id": strategy.get("corpus_id", strategy["id"]), "retrieval_mode": strategy.get("retrieval_mode"), + "retrieval_budget": _retrieval_budget(config, strategy), "search_memory_type": strategy.get("search_memory_type", "experiences"), "adapter_status": strategy.get("adapter_status", "ready"), "executable": command is not None, "user_simulator_policy": user_simulator_policy(config), "user_simulator_policy_supported": policy_report["supported"], + "fixed_first_user_file": str(fixed_first_user_file) + if fixed_first_user_file + else None, "split_file": str(split_path), "command": command, "non_executable_reason": non_executable_reason, diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index f8835aace6..5b0cd62003 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -2,6 +2,7 @@ from __future__ import annotations import argparse +import hashlib import importlib import json import shutil @@ -30,6 +31,7 @@ "grant_", "reboot_", ) +FIXED_FIRST_USER_NAME = "openviking_fixed_first_user_simulator" def _json(text: str) -> dict[str, Any]: @@ -177,6 +179,68 @@ def _message_text(message: dict[str, Any]) -> tuple[str, str]: return "assistant", str(message.get("content") or "") +def _scenario_sha256(instructions: str) -> str: + return hashlib.sha256(instructions.encode("utf-8")).hexdigest() + + +def _load_fixed_first_user_fixture(path: Path) -> dict[str, str]: + if not path.is_file(): + raise FileNotFoundError(f"fixed-first-user fixture not found: {path}") + data = json.loads(path.read_text(encoding="utf-8")) + mapping = data.get("by_scenario_sha256") if isinstance(data, dict) else None + if not isinstance(mapping, dict) or not mapping: + raise ValueError(f"fixed-first-user fixture has no by_scenario_sha256 map: {path}") + return {str(key): str(value) for key, value in mapping.items()} + + +def _has_user_message(state: Any) -> bool: + for message in getattr(state, "messages", []) or []: + role = getattr(message, "role", None) + if str(getattr(role, "value", role)) == "user": + return True + return False + + +def _append_incoming_user_context(message: Any, state: Any) -> None: + from tau2.data_model.message import AssistantMessage, MultiToolMessage, ToolMessage + + if isinstance(message, MultiToolMessage): + state.messages.extend(message.tool_messages) + elif isinstance(message, ToolMessage): + state.messages.append(message) + elif isinstance(message, AssistantMessage) and (message.has_content() or message.is_tool_call()): + state.messages.append(message) + + +def _register_fixed_first_user(args: argparse.Namespace) -> str: + if not args.fixed_first_user_file: + return args.user + _add_tau2_to_path(args.tau2_repo) + mapping = _load_fixed_first_user_fixture(args.fixed_first_user_file) + + from tau2.data_model.message import UserMessage + from tau2.registry import registry + from tau2.user.user_simulator import UserSimulator + + class FixedFirstUserSimulator(UserSimulator): # type: ignore[misc] + def _generate_next_message(self, message: Any, state: Any) -> UserMessage: # type: ignore[override] + if not _has_user_message(state): + key = _scenario_sha256(str(self.instructions or "")) + fixed = mapping.get(key) + if fixed is None: + raise RuntimeError( + "fixed-first-user fixture does not cover this TAU-2 scenario: " + f"sha256={key}" + ) + _append_incoming_user_context(message, state) + return UserMessage(role="user", content=fixed) + return super()._generate_next_message(message, state) + + if FIXED_FIRST_USER_NAME not in registry.get_users(): + registry.register_user(FixedFirstUserSimulator, FIXED_FIRST_USER_NAME) + return FIXED_FIRST_USER_NAME + + def _run_tau2( *, tau2_repo: Path, @@ -399,28 +463,32 @@ def get_init_state(self, message_history=None): ) return state - def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: + def _retrieve( + self, query: str, *, search_limit: int, inject_limit: int + ) -> tuple[str, list[dict[str, Any]]]: client = _client(args) rows: list[dict[str, Any]] = [] try: result = client.search( - query=query, target_uri=args.search_uri, limit=args.retrieval_top_k + query=query, target_uri=args.search_uri, limit=search_limit ) memories = list(getattr(result, "memories", []) or []) blocks = [] - for index, match in enumerate(memories[: args.retrieval_top_k], 1): + for index, match in enumerate(memories[:search_limit], 1): uri = getattr(match, "uri", "") text, read_error = _read_memory_text(client, match) + injected = index <= inject_limit and bool(text.strip()) row = { "uri": uri, "score": getattr(match, "score", None), "level": getattr(match, "level", None), "text_chars": len(text), + "injected": injected, } if read_error: row["read_error"] = read_error rows.append(row) - if text.strip(): + if injected: blocks.append(f"Memory {index} ({uri}):\n{text.strip()}") return "\n\n".join(blocks), rows finally: @@ -432,7 +500,7 @@ def _trace(self, event: dict[str, Any]) -> None: @staticmethod def _trace_injection_fields(block: str, matches: list[dict[str, Any]]) -> dict[str, Any]: - injected_count = sum(1 for row in matches if int(row.get("text_chars") or 0) > 0) + injected_count = sum(1 for row in matches if row.get("injected")) return { "injected": bool(block.strip()), "injected_count": injected_count if block.strip() else 0, @@ -519,7 +587,11 @@ def generate_next_message(self, message, state: LLMAgentState): role_value = getattr(role, "value", role) if marker_index is not None and str(role_value) == "user": query = str(getattr(message, "content", "") or "") - block, matches = self._retrieve(query) + block, matches = self._retrieve( + query, + search_limit=args.first_user_retrieval_top_k, + inject_limit=args.first_user_inject_top_k, + ) prompt = ( "No OpenViking memory matched this user request." if not block @@ -531,6 +603,8 @@ def generate_next_message(self, message, state: LLMAgentState): { "decision_node": "first_user", "query": query, + "search_limit": args.first_user_retrieval_top_k, + "inject_limit": args.first_user_inject_top_k, "match_count": len(matches), "matches": matches, **self._trace_injection_fields(block, matches), @@ -543,11 +617,17 @@ def generate_next_message(self, message, state: LLMAgentState): write_calls = [call for call in tool_calls if _is_write_tool_call(call)] if write_calls: query = _tool_call_query(write_calls, state.messages) - block, matches = self._retrieve(query) + block, matches = self._retrieve( + query, + search_limit=args.prewrite_retrieval_top_k, + inject_limit=args.prewrite_inject_top_k, + ) self._trace( { "decision_node": "before_write_tool_call", "query": query, + "search_limit": args.prewrite_retrieval_top_k, + "inject_limit": args.prewrite_inject_top_k, "match_count": len(matches), "matches": matches, **self._trace_injection_fields(block, matches), @@ -620,6 +700,11 @@ def main() -> int: parser.add_argument("--openviking-wait-timeout", type=int, default=600) parser.add_argument("--search-uri") parser.add_argument("--retrieval-top-k", type=int, default=4) + parser.add_argument("--first-user-retrieval-top-k", type=int) + parser.add_argument("--first-user-inject-top-k", type=int) + parser.add_argument("--prewrite-retrieval-top-k", type=int) + parser.add_argument("--prewrite-inject-top-k", type=int) + parser.add_argument("--fixed-first-user-file", type=Path) parser.add_argument( "--retrieval-mode", choices=["first_user", "prewrite", "first_user_prewrite"], @@ -659,6 +744,12 @@ def main() -> int: args.run_dir.mkdir(parents=True, exist_ok=True) corpus_dir = args.corpus_dir or args.run_dir corpus_dir.mkdir(parents=True, exist_ok=True) + args.first_user_retrieval_top_k = args.first_user_retrieval_top_k or args.retrieval_top_k + args.first_user_inject_top_k = args.first_user_inject_top_k or args.first_user_retrieval_top_k + args.prewrite_retrieval_top_k = args.prewrite_retrieval_top_k or args.retrieval_top_k + args.prewrite_inject_top_k = args.prewrite_inject_top_k or args.prewrite_retrieval_top_k + if args.fixed_first_user_file is not None: + args.fixed_first_user_file = args.fixed_first_user_file.expanduser().resolve() train_results = corpus_dir / "train_results.json" corpus_manifest = corpus_dir / "corpus_manifest.json" eval_results = args.run_dir / f"{args.run_label}.json" @@ -666,6 +757,7 @@ def main() -> int: summary_path = args.run_dir / f"{args.run_label}.summary.json" if args.no_memory: + user_name = _register_fixed_first_user(args) _run_tau2( tau2_repo=args.tau2_repo, domain=args.domain, @@ -676,7 +768,7 @@ def main() -> int: max_steps=args.max_steps, max_concurrency=args.max_concurrency, agent=args.base_agent, - user=args.user, + user=user_name, agent_llm=args.agent_llm, user_llm=args.user_llm, agent_llm_args=args.agent_llm_args, @@ -692,6 +784,9 @@ def main() -> int: "domain": args.domain, "strategy_id": args.strategy_id, "seed": args.seed, + "fixed_first_user_file": str(args.fixed_first_user_file) + if args.fixed_first_user_file + else None, "eval_results": str(eval_results), "metrics": _metrics(eval_results), } @@ -718,6 +813,7 @@ def main() -> int: trace_path.touch() _register_memory_agent(args, trace_path) + user_name = _register_fixed_first_user(args) _run_tau2( tau2_repo=args.tau2_repo, domain=args.domain, @@ -728,7 +824,7 @@ def main() -> int: max_steps=args.max_steps, max_concurrency=args.max_concurrency, agent=AGENT_NAME, - user=args.user, + user=user_name, agent_llm=args.agent_llm, user_llm=args.user_llm, agent_llm_args=args.agent_llm_args, @@ -744,7 +840,16 @@ def main() -> int: "domain": args.domain, "strategy_id": args.strategy_id, "retrieval_mode": args.retrieval_mode, + "retrieval": { + "first_user_retrieval_top_k": args.first_user_retrieval_top_k, + "first_user_inject_top_k": args.first_user_inject_top_k, + "prewrite_retrieval_top_k": args.prewrite_retrieval_top_k, + "prewrite_inject_top_k": args.prewrite_inject_top_k, + }, "seed": args.seed, + "fixed_first_user_file": str(args.fixed_first_user_file) + if args.fixed_first_user_file + else None, "corpus": corpus, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), From f85d60bf22bc8e73f14e44775d90f2f0858ed477 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 12:15:38 +0800 Subject: [PATCH 36/42] bench(tau2): reuse memory corpora across eval runs --- benchmark/tau2/README.md | 11 ++- benchmark/tau2/config/baseline.yaml | 4 + benchmark/tau2/scripts/run_eval.py | 135 ++++++++++++++++++++++++---- 3 files changed, 127 insertions(+), 23 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 4bdc566bf1..998c93d440 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -25,7 +25,9 @@ benchmark/tau2/ └── run_full_eval.sh ``` -Generated artifacts are written to `benchmark/tau2/result//`. +Generated eval artifacts are written to `benchmark/tau2/result//`. +Memory corpus artifacts are cached outside the run id at +`benchmark/tau2/result/memory_corpora/` by default. ## Quick Start @@ -138,9 +140,10 @@ The existing `train_memory_mode: experience_only` value selects the Memory V2 session-commit path. `search_memory_type` selects which generated memory bucket is retrieved during eval (`experiences` by default, `trajectories` for `config/trajectory.yaml`). The runner prepares each distinct -`domain + corpus_id` once before executing eval cells. Different corpora may be -prepared in parallel with `benchmark.corpus_prepare_concurrency`; session -commits inside one corpus remain serial to preserve OpenViking write semantics. +`domain + corpus_id` once and reuses it across eval run ids when the cached +`corpus_manifest.json` is present. Different corpora may be prepared in +parallel with `benchmark.corpus_prepare_concurrency`; session commits inside one +corpus remain serial to preserve OpenViking write semantics. ## User Simulator Policy diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 00ec9dbd9c..22cf4f4ecc 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -18,6 +18,9 @@ paths: tau2_repo: ${TAU2_REPO:-data/external_benchmarks/tau2-bench} tau2_cli: ${TAU2_CLI:-tau2} output_dir: benchmark/tau2/result + # Corpus writes are expensive and should be reused across eval run ids when + # the train split and memory prompt/config did not change. + corpus_cache_dir: benchmark/tau2/result/memory_corpora eval: # The runner default is official if this field is omitted. The OpenViking @@ -39,6 +42,7 @@ openviking: url: ${OPENVIKING_URL:-http://localhost:1933} account: ${OPENVIKING_ACCOUNT:-default} agent_id: ${OPENVIKING_AGENT_ID:-tau2-openviking-agent} + reuse_corpus_across_runs: true retrieval_top_k: 4 prewrite_retrieval_top_k: 6 prewrite_inject_top_k: 2 diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 5bd3037a03..0afa2dfef8 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -110,6 +110,49 @@ def _retrieval_budget(config: dict[str, Any], strategy: dict[str, Any]) -> dict[ } +def _memory_corpus_key_for( + *, + domain: str, + strategy: dict[str, Any], + train_num_tasks: int | None, +) -> str: + corpus_id = str(strategy.get("corpus_id") or strategy["id"]) + raw_key = strategy.get("corpus_cache_key") + if raw_key: + key = str(raw_key).format( + domain=domain, + strategy_id=strategy["id"], + corpus_id=corpus_id, + ) + else: + key = f"{domain}_{corpus_id}" + if train_num_tasks is not None: + key = f"{key}_train{train_num_tasks}" + return key + + +def _memory_corpus_dir(config: dict[str, Any], configured_run_id: str, corpus_key: str) -> Path: + raw = config.get("paths", {}).get("corpus_cache_dir") + if raw: + return resolve_path(str(raw)) / corpus_key + return output_dir(config, configured_run_id) / "memory_corpora" / corpus_key + + +def _manifest_openviking_identity(corpus_dir: Path) -> dict[str, str] | None: + manifest_path = corpus_dir / "corpus_manifest.json" + if not manifest_path.is_file(): + return None + try: + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return None + openviking = manifest.get("openviking") or {} + required = ("account", "user", "agent_id", "search_uri") + if not all(openviking.get(key) for key in required): + return None + return {key: str(openviking[key]) for key in required} + + def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: data = json.loads(results_path.read_text(encoding="utf-8")) assert_tau2_results_complete(data, context=str(results_path)) @@ -155,15 +198,38 @@ def _tau2_command( ): openviking = config["openviking"] corpus_id = str(strategy.get("corpus_id") or strategy["id"]) - account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}" - agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" - user = f"tau2-{domain}-{corpus_id}" + resolved_train_num_tasks = ( + train_num_tasks if train_num_tasks is not None else strategy.get("train_num_tasks") + ) + corpus_key = _memory_corpus_key_for( + domain=domain, + strategy=strategy, + train_num_tasks=resolved_train_num_tasks, + ) + corpus_dir = _memory_corpus_dir(config, configured_run_id, corpus_key) + reuse_identity = _manifest_openviking_identity(corpus_dir) + if reuse_identity is not None: + account = reuse_identity["account"] + agent_id = reuse_identity["agent_id"] + user = reuse_identity["user"] + search_uri = reuse_identity["search_uri"] + elif openviking.get("reuse_corpus_across_runs", False): + account = f"{openviking['account']}-{corpus_key}" + agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" + user = f"tau2-{domain}-{corpus_id}" + search_uri = "" + else: + account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}" + agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" + user = f"tau2-{domain}-{corpus_id}" + search_uri = "" search_memory_type = str(strategy.get("search_memory_type", "experiences")) if search_memory_type not in {"experiences", "trajectories"}: raise ValueError( f"Unsupported search_memory_type for {strategy['id']}: {search_memory_type}" ) - search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" + if not search_uri: + search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" budget = _retrieval_budget(config, strategy) command = [ sys.executable, @@ -173,7 +239,7 @@ def _tau2_command( "--run-dir", str(output_dir(config, configured_run_id) / "memory_cells" / run_label), "--corpus-dir", - str(output_dir(config, configured_run_id) / "memory_corpora" / f"{domain}_{corpus_id}"), + str(corpus_dir), "--run-label", run_label, "--strategy-id", @@ -228,11 +294,8 @@ def _tau2_command( command.extend(["--task-id", task_id]) elif num_tasks is not None: command.extend(["--num-tasks", str(num_tasks)]) - train_num_tasks = ( - train_num_tasks if train_num_tasks is not None else strategy.get("train_num_tasks") - ) - if train_num_tasks is not None: - command.extend(["--train-num-tasks", str(train_num_tasks)]) + if resolved_train_num_tasks is not None: + command.extend(["--train-num-tasks", str(resolved_train_num_tasks)]) return command if strategy.get("memory_backend") != "none": @@ -360,6 +423,30 @@ def _build_plan( "train_required": bool(strategy.get("train_required")), "memory_backend": strategy.get("memory_backend"), "corpus_id": strategy.get("corpus_id", strategy["id"]), + "corpus_key": _memory_corpus_key_for( + domain=domain, + strategy=strategy, + train_num_tasks=( + train_num_tasks + if train_num_tasks is not None + else strategy.get("train_num_tasks") + ), + ), + "corpus_dir": str( + _memory_corpus_dir( + config, + configured_run_id, + _memory_corpus_key_for( + domain=domain, + strategy=strategy, + train_num_tasks=( + train_num_tasks + if train_num_tasks is not None + else strategy.get("train_num_tasks") + ), + ), + ) + ), "retrieval_mode": strategy.get("retrieval_mode"), "retrieval_budget": _retrieval_budget(config, strategy), "search_memory_type": strategy.get("search_memory_type", "experiences"), @@ -401,8 +488,7 @@ def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, st } if cell.get("memory_backend") == "none": return artifacts - corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) - corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" + corpus_dir = Path(cell["corpus_dir"]) artifacts["retrieval_trace"] = str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl") artifacts["corpus_manifest"] = str(corpus_dir / "corpus_manifest.json") return artifacts @@ -422,8 +508,7 @@ def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, def _memory_corpus_key(cell: dict[str, Any]) -> str: - corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) - return f"{cell['domain']}_{corpus_id}" + return str(cell.get("corpus_key") or f"{cell['domain']}_{cell['corpus_id']}") def _tau2_subprocess_env(repo: Path) -> dict[str, str]: @@ -439,6 +524,20 @@ def _tau2_subprocess_env(repo: Path) -> dict[str, str]: def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, Any]: key = _memory_corpus_key(cell) + manifest_path = Path(cell["corpus_dir"]) / "corpus_manifest.json" + if manifest_path.is_file(): + row = { + "domain": cell["domain"], + "strategy_id": cell["strategy_id"], + "corpus_id": str(cell.get("corpus_id") or cell["strategy_id"]), + "corpus_key": key, + "returncode": 0, + "reused": True, + "artifacts": {"corpus_manifest": str(manifest_path)}, + } + write_json(out / "corpus_prepare_results" / f"{key}.json", row) + print(f"[tau2] reusing corpus {key}", flush=True) + return row command = list(cell["command"]) + ["--prepare-corpus-only"] print(f"[tau2] preparing corpus {key}", flush=True) completed = subprocess.run( @@ -450,18 +549,16 @@ def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[ stderr=subprocess.PIPE, check=False, ) - corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) row = { "domain": cell["domain"], "strategy_id": cell["strategy_id"], - "corpus_id": corpus_id, + "corpus_id": str(cell.get("corpus_id") or cell["strategy_id"]), + "corpus_key": key, "returncode": completed.returncode, "stdout_tail": completed.stdout[-4000:], "stderr_tail": completed.stderr[-4000:], "artifacts": { - "corpus_manifest": str( - out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" / "corpus_manifest.json" - ) + "corpus_manifest": str(Path(cell["corpus_dir"]) / "corpus_manifest.json") }, } write_json(out / "corpus_prepare_results" / f"{key}.json", row) From 436e2a4fb7d1f3db72d9fb2494009a8009860a65 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 15:45:36 +0800 Subject: [PATCH 37/42] bench(tau2): add custom S84 category runner --- benchmark/tau2/config/category_rerank.yaml | 3 +- .../tau2/config/custom_s84_category.yaml | 50 +++++ .../scope_prompts/airline_memory_scope.md | 18 ++ .../scope_prompts/retail_memory_scope.md | 16 ++ benchmark/tau2/config/trajectory.yaml | 11 + benchmark/tau2/scripts/run_eval.py | 199 ++++++++++++++---- benchmark/tau2/scripts/run_memory_v2_eval.py | 37 ++++ tests/benchmark/test_tau2_category_rerank.py | 49 ++++- 8 files changed, 345 insertions(+), 38 deletions(-) create mode 100644 benchmark/tau2/config/custom_s84_category.yaml create mode 100644 benchmark/tau2/config/scope_prompts/airline_memory_scope.md create mode 100644 benchmark/tau2/config/scope_prompts/retail_memory_scope.md diff --git a/benchmark/tau2/config/category_rerank.yaml b/benchmark/tau2/config/category_rerank.yaml index 86cde2aff4..f098f203c5 100644 --- a/benchmark/tau2/config/category_rerank.yaml +++ b/benchmark/tau2/config/category_rerank.yaml @@ -35,4 +35,5 @@ strategies: enabled: true injection_point: system_prompt domain_files: - retail: benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md + retail: benchmark/tau2/config/scope_prompts/retail_memory_scope.md + airline: benchmark/tau2/config/scope_prompts/airline_memory_scope.md diff --git a/benchmark/tau2/config/custom_s84_category.yaml b/benchmark/tau2/config/custom_s84_category.yaml new file mode 100644 index 0000000000..db5ff11881 --- /dev/null +++ b/benchmark/tau2/config/custom_s84_category.yaml @@ -0,0 +1,50 @@ +extends: baseline.yaml + +benchmark: + name: tau2_openviking_custom_s84_scope_category + cell_timeout_seconds: 10800 + +openviking: + url: ${OPENVIKING_CUSTOM_URL:-http://localhost:1950} + +eval: + fixed_first_user_fixtures: + retail: /Users/bytedance/Documents/agent-harness/outputs/open_benchmarks/tau2_fixed_first_user_fixtures_v0/tau2_retail_fixed_first_user_from_agent_full_20260502/fixed_first_user_fixture.json + airline: /Users/bytedance/Documents/agent-harness/outputs/open_benchmarks/tau2_fixed_first_user_fixtures_v0/tau2_airline_fixed_first_user_from_no_memory_full_20260503/fixed_first_user_fixture.json + +strategies: + - id: custom_s84_scope_category_positive_match + label: Custom procedure L2 S84 scope prompt + category positive-match + memory_backend: openviking + train_required: false + corpus_id: custom_procedure_steps_l2_20260510 + train_memory_mode: external_procedure_l2 + search_memory_type: procedures + retrieval_mode: first_user_prewrite + external_openviking: + retail: + url: ${OPENVIKING_CUSTOM_URL:-http://localhost:1950} + account: agent-harness-custom-procedure-steps-retail-20260510 + user: tau2-retail-custom-procedure-steps-user-20260510 + agent_id: tau2-retail-custom-procedure-steps-agent-20260510 + search_uri: viking://agent/tau2-retail-custom-procedure-steps-agent-20260510/memories/procedures/retail + airline: + url: ${OPENVIKING_CUSTOM_URL:-http://localhost:1950} + account: agent-harness-custom-procedure-steps-airline-20260510 + user: tau2-airline-custom-procedure-steps-user-20260510 + agent_id: tau2-airline-custom-procedure-steps-agent-20260510 + search_uri: viking://agent/tau2-airline-custom-procedure-steps-agent-20260510/memories/procedures/airline + scope_prompt_files: + retail: benchmark/tau2/config/scope_prompts/retail_memory_scope.md + airline: benchmark/tau2/config/scope_prompts/airline_memory_scope.md + category_rerank: + enabled: true + catalog_path: benchmark/tau2/config/category_catalog.json + apply_nodes: + - before_write_tool_call + retrieve_limit: 6 + inject_limit: 2 + mismatch_policy: keep_positive_match_drop_mismatch + positive_match_required: true + no_match_policy: skip_injection + search_score_weight: 0.0 diff --git a/benchmark/tau2/config/scope_prompts/airline_memory_scope.md b/benchmark/tau2/config/scope_prompts/airline_memory_scope.md new file mode 100644 index 0000000000..8847796a97 --- /dev/null +++ b/benchmark/tau2/config/scope_prompts/airline_memory_scope.md @@ -0,0 +1,18 @@ + +OpenViking memories are advisory. Use them only when their trigger, preconditions, +and applicability boundary match the current airline task. + +- Do not broaden the user's requested booking, cancellation, rebooking, flight + update, passenger update, baggage update, insurance, or payment scope because a + retrieved memory describes a nearby workflow. +- Keep the current reservation scope explicit. Only use flights, passengers, + baggage entries, cabin changes, insurance choices, payment IDs, dates, and + amounts that are grounded in user input, recent tool observations, reservation + state, profile/payment state, or an explicit search/lookup result. +- Before a write tool call, verify that the selected write action matches the + user's requested operation. Do not mix cancellation, rebooking, upgrade, + downgrade, baggage, or passenger-update flows unless the current task asks for + that combined operation. +- If a memory and the current task disagree, follow the current task state and the + domain policy. + diff --git a/benchmark/tau2/config/scope_prompts/retail_memory_scope.md b/benchmark/tau2/config/scope_prompts/retail_memory_scope.md new file mode 100644 index 0000000000..65a8d61fff --- /dev/null +++ b/benchmark/tau2/config/scope_prompts/retail_memory_scope.md @@ -0,0 +1,16 @@ + +OpenViking memories are advisory. Use them only when their trigger, preconditions, +and applicability boundary match the current retail task. + +- Do not broaden the user's requested replacement, return, exchange, cancellation, + address-change, or payment scope because a retrieved memory describes a nearby + workflow. +- If the user restricts the request to the current order, same order, observed + order items, or a specific product variant, choose write arguments only from the + current tool observations or an explicitly requested catalog lookup. +- Before a write tool call, order IDs, item IDs, new item IDs, payment method IDs, + addresses, amounts, and refund/payment direction must be grounded in user input, + recent tool observations, profile/order state, or an explicit catalog lookup. +- If a memory and the current task disagree, follow the current task state and the + domain policy. + diff --git a/benchmark/tau2/config/trajectory.yaml b/benchmark/tau2/config/trajectory.yaml index 5aad55fae1..aabded08cf 100644 --- a/benchmark/tau2/config/trajectory.yaml +++ b/benchmark/tau2/config/trajectory.yaml @@ -20,3 +20,14 @@ strategies: train_memory_mode: experience_only search_memory_type: trajectories retrieval_mode: first_user_prewrite + - id: memory_v2_trajectory_prewrite_scope + label: OpenViking Memory V2 trajectory-view pre-write recall with scope prompt + memory_backend: openviking + train_required: true + corpus_id: memory_v2_trajectory_view + train_memory_mode: experience_only + search_memory_type: trajectories + retrieval_mode: first_user_prewrite + scope_prompt_files: + retail: benchmark/tau2/config/scope_prompts/retail_memory_scope.md + airline: benchmark/tau2/config/scope_prompts/airline_memory_scope.md diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 3656b056e3..d5d5f9a91d 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -161,6 +161,22 @@ def _memory_corpus_dir(config: dict[str, Any], configured_run_id: str, corpus_ke return output_dir(config, configured_run_id) / "memory_corpora" / corpus_key +def _domain_value(value: Any, domain: str) -> Any: + if isinstance(value, dict): + return value.get(domain) or value.get(str(domain).lower()) or value.get("default") + return value + + +def _search_uri_suffix(search_memory_type: str, domain: str) -> str: + if search_memory_type in {"experiences", "trajectories"}: + return search_memory_type + if search_memory_type == "procedures": + return f"procedures/{domain}" + if search_memory_type.startswith("procedures/"): + return search_memory_type.format(domain=domain) + raise ValueError(f"Unsupported search_memory_type: {search_memory_type}") + + def _manifest_openviking_identity(corpus_dir: Path) -> dict[str, str] | None: manifest_path = corpus_dir / "corpus_manifest.json" if not manifest_path.is_file(): @@ -176,6 +192,40 @@ def _manifest_openviking_identity(corpus_dir: Path) -> dict[str, str] | None: return {key: str(openviking[key]) for key in required} +def _external_openviking_identity( + config: dict[str, Any], + strategy: dict[str, Any], + domain: str, +) -> dict[str, str]: + openviking = config.get("openviking", {}) + raw_identity = strategy.get("external_openviking") or strategy.get("openviking_identity") or {} + identity = _domain_value(raw_identity, domain) + identity = identity if isinstance(identity, dict) else {} + values = { + "url": identity.get("url") + or _domain_value(strategy.get("openviking_url"), domain) + or openviking.get("url"), + "account": identity.get("account") + or _domain_value(strategy.get("openviking_account"), domain) + or _domain_value(openviking.get("account"), domain), + "user": identity.get("user") + or _domain_value(strategy.get("openviking_user"), domain) + or _domain_value(openviking.get("user"), domain), + "agent_id": identity.get("agent_id") + or _domain_value(strategy.get("openviking_agent_id"), domain) + or _domain_value(openviking.get("agent_id"), domain), + "search_uri": identity.get("search_uri") + or _domain_value(strategy.get("search_uri"), domain) + or "", + } + missing = [key for key in ("url", "account", "user", "agent_id") if not values.get(key)] + if missing: + raise ValueError( + f"external OpenViking identity for {strategy['id']} {domain} missing: {missing}" + ) + return {key: str(value) for key, value in values.items()} + + def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: data = json.loads(results_path.read_text(encoding="utf-8")) assert_tau2_results_complete(data, context=str(results_path)) @@ -214,11 +264,9 @@ def _tau2_command( agent_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' user_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' fixed_first_user_file = _fixed_first_user_file(config, domain) + scope_prompt_file = _scope_prompt_file(config, strategy, domain) - if ( - strategy.get("memory_backend") == "openviking" - and strategy.get("train_memory_mode") == "experience_only" - ): + if strategy.get("memory_backend") == "openviking": openviking = config["openviking"] corpus_id = str(strategy.get("corpus_id") or strategy["id"]) resolved_train_num_tasks = ( @@ -230,29 +278,40 @@ def _tau2_command( train_num_tasks=resolved_train_num_tasks, ) corpus_dir = _memory_corpus_dir(config, configured_run_id, corpus_key) - reuse_identity = _manifest_openviking_identity(corpus_dir) - if reuse_identity is not None: - account = reuse_identity["account"] - agent_id = reuse_identity["agent_id"] - user = reuse_identity["user"] - search_uri = reuse_identity["search_uri"] - elif openviking.get("reuse_corpus_across_runs", False): - account = f"{openviking['account']}-{corpus_key}" - agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" - user = f"tau2-{domain}-{corpus_id}" - search_uri = "" + train_memory_mode = str(strategy.get("train_memory_mode") or "") + skip_train = False + openviking_url = str(openviking["url"]) + if train_memory_mode == "experience_only": + reuse_identity = _manifest_openviking_identity(corpus_dir) + if reuse_identity is not None: + account = reuse_identity["account"] + agent_id = reuse_identity["agent_id"] + user = reuse_identity["user"] + search_uri = reuse_identity["search_uri"] + elif openviking.get("reuse_corpus_across_runs", False): + account = f"{openviking['account']}-{corpus_key}" + agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" + user = f"tau2-{domain}-{corpus_id}" + search_uri = "" + else: + account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}" + agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" + user = f"tau2-{domain}-{corpus_id}" + search_uri = "" + elif train_memory_mode in {"external", "external_procedure_l2", "custom_procedure_l2"}: + identity = _external_openviking_identity(config, strategy, domain) + openviking_url = identity["url"] + account = identity["account"] + agent_id = identity["agent_id"] + user = identity["user"] + search_uri = identity.get("search_uri", "") + skip_train = True else: - account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}" - agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" - user = f"tau2-{domain}-{corpus_id}" - search_uri = "" + return None search_memory_type = str(strategy.get("search_memory_type", "experiences")) - if search_memory_type not in {"experiences", "trajectories"}: - raise ValueError( - f"Unsupported search_memory_type for {strategy['id']}: {search_memory_type}" - ) + search_uri_suffix = _search_uri_suffix(search_memory_type, domain) if not search_uri: - search_uri = f"viking://agent/{agent_id}/memories/{search_memory_type}" + search_uri = f"viking://agent/{agent_id}/memories/{search_uri_suffix}" budget = _retrieval_budget(config, strategy) category_rerank = strategy.get("category_rerank") category_rerank = category_rerank if isinstance(category_rerank, dict) else {} @@ -290,7 +349,7 @@ def _tau2_command( "--user-llm-args", user_llm_args, "--openviking-url", - str(openviking["url"]), + openviking_url, "--openviking-account", account, "--openviking-user", @@ -314,6 +373,8 @@ def _tau2_command( "--seed", str(seed), ] + if skip_train: + command.append("--skip-train") if fixed_first_user_file is not None: command.extend(["--fixed-first-user-file", str(fixed_first_user_file)]) if category_rerank.get("enabled"): @@ -323,6 +384,8 @@ def _tau2_command( json.dumps(category_rerank, ensure_ascii=False, sort_keys=True), ] ) + if scope_prompt_file is not None: + command.extend(["--scope-prompt-file", str(scope_prompt_file)]) if scope_prompt.get("enabled"): command.extend( [ @@ -400,6 +463,23 @@ def _fixed_first_user_file(config: dict[str, Any], domain: str) -> Path | None: return resolve_path(str(raw)) +def _scope_prompt_file( + config: dict[str, Any], strategy: dict[str, Any], domain: str +) -> Path | None: + raw = strategy.get("scope_prompt_file") + if raw is None: + raw = strategy.get("scope_prompt_files") + if raw is None: + raw = config.get("openviking", {}).get("scope_prompt_file") + if raw is None: + raw = config.get("openviking", {}).get("scope_prompt_files") + if isinstance(raw, dict): + raw = raw.get(domain) or raw.get("default") + if raw is None or str(raw).strip() == "": + return None + return resolve_path(str(raw)) + + def _build_plan( config: dict[str, Any], configured_run_id: str, @@ -413,6 +493,7 @@ def _build_plan( ) -> dict[str, Any]: repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 8)) base_seed = int(config["benchmark"].get("seed", 300)) + cell_timeout_seconds = int(config["benchmark"].get("cell_timeout_seconds", 0) or 0) policy_report = simulator_policy_report(config) strategies = config.get("strategies") or [] if selected_strategy_ids: @@ -447,6 +528,7 @@ def _build_plan( seed=seed, ) fixed_first_user_file = _fixed_first_user_file(config, domain) + scope_prompt_file = _scope_prompt_file(config, strategy, domain) non_executable_reason = None if command is None: non_executable_reason = ( @@ -500,6 +582,7 @@ def _build_plan( "fixed_first_user_file": str(fixed_first_user_file) if fixed_first_user_file else None, + "scope_prompt_file": str(scope_prompt_file) if scope_prompt_file else None, "split_file": str(split_path), "command": command, "non_executable_reason": non_executable_reason, @@ -518,6 +601,7 @@ def _build_plan( "executable_cell_count": executable_cell_count, "pending_cell_count": len(cells) - executable_cell_count, "corpus_prepare_concurrency": int(config["benchmark"].get("corpus_prepare_concurrency", 1)), + "cell_timeout_seconds": cell_timeout_seconds or None, "cells": cells, } @@ -742,22 +826,55 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str ) _prepare_memory_corpora(plan, repo, out) rows = [] + cell_timeout = int(plan.get("cell_timeout_seconds") or 0) or None for cell in plan["cells"]: if not cell.get("executable"): raise RuntimeError( f"cell is not executable yet: {cell['run_label']} " f"(strategy_id={cell['strategy_id']}, adapter_status={cell.get('adapter_status')})" ) + cell_result_path = out / "cell_results" / f"{cell['run_label']}.json" + if cell_result_path.is_file(): + existing_row = json.loads(cell_result_path.read_text(encoding="utf-8")) + if existing_row.get("returncode") == 0 and existing_row.get("metrics"): + print(f"[tau2] skipping completed {cell['run_label']}") + rows.append(existing_row) + continue print(f"[tau2] running {cell['run_label']}") - completed = subprocess.run( - cell["command"], - cwd=repo, - env=_tau2_subprocess_env(repo), - text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False, - ) + try: + completed = subprocess.run( + cell["command"], + cwd=repo, + env=_tau2_subprocess_env(repo), + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + timeout=cell_timeout, + ) + except subprocess.TimeoutExpired as exc: + stdout = exc.stdout or "" + stderr = exc.stderr or "" + if isinstance(stdout, bytes): + stdout = stdout.decode(errors="replace") + if isinstance(stderr, bytes): + stderr = stderr.decode(errors="replace") + row = { + "run_label": cell["run_label"], + "domain": cell["domain"], + "strategy_id": cell["strategy_id"], + "returncode": 124, + "timed_out": True, + "timeout_seconds": cell_timeout, + "stdout_tail": stdout[-4000:], + "stderr_tail": stderr[-4000:], + "artifacts": _cell_artifacts(cell, repo, out), + "metrics": None, + } + write_json(cell_result_path, row) + raise RuntimeError( + f"cell timed out after {cell_timeout}s: {cell['run_label']}" + ) from exc row = { "run_label": cell["run_label"], "domain": cell["domain"], @@ -770,7 +887,7 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str row["metrics"] = _cell_metrics(cell, row["artifacts"]) row["runtime_evidence"] = _cell_runtime_evidence(cell, row["artifacts"]) rows.append(row) - write_json(out / "cell_results" / f"{cell['run_label']}.json", row) + write_json(cell_result_path, row) if completed.returncode != 0: raise RuntimeError( f"cell failed: {cell['run_label']} returncode={completed.returncode}" @@ -841,14 +958,24 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: scope_prompt_rows = [] for strategy in config.get("strategies") or []: scope_prompt = strategy.get("scope_prompt") - if not isinstance(scope_prompt, dict) or not scope_prompt.get("enabled"): + direct_scope_prompt = ( + strategy.get("scope_prompt_file") + or strategy.get("scope_prompt_files") + or config.get("openviking", {}).get("scope_prompt_file") + or config.get("openviking", {}).get("scope_prompt_files") + ) + has_scope_prompt_config = isinstance(scope_prompt, dict) and scope_prompt.get("enabled") + if not has_scope_prompt_config and not direct_scope_prompt: continue + scope_prompt = scope_prompt if isinstance(scope_prompt, dict) else {} domain_files = scope_prompt.get("domain_files") domain_files = domain_files if isinstance(domain_files, dict) else {} domain_texts = scope_prompt.get("domain_texts") domain_texts = domain_texts if isinstance(domain_texts, dict) else {} for domain in domains(config): raw_prompt_path = domain_files.get(domain) + if raw_prompt_path is None: + raw_prompt_path = _domain_value(direct_scope_prompt, domain) prompt_path = None exists = False if raw_prompt_path: diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 976e095417..f7ca6d89be 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -736,6 +736,30 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) if corpus_manifest.is_file() and not args.force_train: return json.loads(corpus_manifest.read_text()) + if args.skip_train: + client = _client(args) + try: + corpus_probe = _probe_corpus(args, client) + finally: + client.close() + manifest = { + "domain": args.domain, + "train_results": None, + "external_corpus": True, + "openviking": { + "url": args.openviking_url, + "account": args.openviking_account, + "user": args.openviking_user, + "agent_id": args.openviking_agent_id, + "search_uri": args.search_uri, + }, + "committed_sessions": [], + "committed_session_count": 0, + "corpus_probe": corpus_probe, + } + _write_json(corpus_manifest, manifest) + return manifest + _run_tau2( tau2_repo=args.tau2_repo, domain=args.domain, @@ -1114,6 +1138,7 @@ def main() -> int: parser.add_argument("--prewrite-retrieval-top-k", type=int) parser.add_argument("--prewrite-inject-top-k", type=int) parser.add_argument("--fixed-first-user-file", type=Path) + parser.add_argument("--scope-prompt-file", type=Path) parser.add_argument( "--retrieval-mode", choices=["first_user", "prewrite", "first_user_prewrite"], @@ -1121,6 +1146,7 @@ def main() -> int: ) parser.add_argument("--category-rerank-config", type=_json, default={}) parser.add_argument("--scope-prompt-config", type=_json, default={}) + parser.add_argument("--skip-train", action="store_true") parser.add_argument("--force-train", action="store_true") parser.add_argument("--prepare-corpus-only", action="store_true") parser.add_argument( @@ -1170,6 +1196,16 @@ def main() -> int: args.prewrite_inject_top_k = args.prewrite_inject_top_k or args.prewrite_retrieval_top_k if args.fixed_first_user_file is not None: args.fixed_first_user_file = args.fixed_first_user_file.expanduser().resolve() + if args.scope_prompt_file is not None: + args.scope_prompt_file = args.scope_prompt_file.expanduser().resolve() + if not args.scope_prompt_file.is_file(): + parser.error(f"--scope-prompt-file does not exist: {args.scope_prompt_file}") + if isinstance(args.scope_prompt_config, dict) and args.scope_prompt_config.get("enabled"): + parser.error("--scope-prompt-file and enabled --scope-prompt-config are mutually exclusive") + args.scope_prompt_config = { + "enabled": True, + "domain_files": {args.domain: str(args.scope_prompt_file)}, + } train_results = corpus_dir / "train_results.json" corpus_manifest = corpus_dir / "corpus_manifest.json" eval_results = args.run_dir / f"{args.run_label}.json" @@ -1272,6 +1308,7 @@ def main() -> int: "fixed_first_user_file": str(args.fixed_first_user_file) if args.fixed_first_user_file else None, + "scope_prompt_file": str(args.scope_prompt_file) if args.scope_prompt_file else None, "corpus": corpus, "category_rerank": category_summary, "scope_prompt": args.scope_prompt_summary, diff --git a/tests/benchmark/test_tau2_category_rerank.py b/tests/benchmark/test_tau2_category_rerank.py index 052a7d639c..9fd3120b40 100644 --- a/tests/benchmark/test_tau2_category_rerank.py +++ b/tests/benchmark/test_tau2_category_rerank.py @@ -74,7 +74,8 @@ def test_category_rerank_config_matches_s89_alignment_shape() -> None: assert scope_prompt["enabled"] is True assert scope_prompt["injection_point"] == "system_prompt" assert scope_prompt["domain_files"] == { - "retail": "benchmark/tau2/config/scope_prompts/retail_same_order_variant_guard.md" + "retail": "benchmark/tau2/config/scope_prompts/retail_memory_scope.md", + "airline": "benchmark/tau2/config/scope_prompts/airline_memory_scope.md", } assert "memory_v2_trajectory_prewrite" in strategies @@ -82,6 +83,52 @@ def test_category_rerank_config_matches_s89_alignment_shape() -> None: assert not _has_key_fragment(category_strategy, "sidecar") +def test_custom_s84_config_uses_external_procedure_corpus() -> None: + repo_root = Path(__file__).resolve().parents[2] + config = load_config(repo_root / "benchmark/tau2/config/custom_s84_category.yaml") + strategy = config["strategies"][0] + + assert strategy["id"] == "custom_s84_scope_category_positive_match" + assert strategy["train_required"] is False + assert strategy["train_memory_mode"] == "external_procedure_l2" + assert strategy["search_memory_type"] == "procedures" + assert strategy["retrieval_mode"] == "first_user_prewrite" + assert strategy["scope_prompt_files"] == { + "retail": "benchmark/tau2/config/scope_prompts/retail_memory_scope.md", + "airline": "benchmark/tau2/config/scope_prompts/airline_memory_scope.md", + } + assert strategy["external_openviking"]["retail"]["search_uri"].endswith( + "/memories/procedures/retail" + ) + assert strategy["external_openviking"]["airline"]["search_uri"].endswith( + "/memories/procedures/airline" + ) + + +def test_tau2_command_supports_custom_s84_external_procedure_strategy() -> None: + repo_root = Path(__file__).resolve().parents[2] + config = load_config(repo_root / "benchmark/tau2/config/custom_s84_category.yaml") + strategy = config["strategies"][0] + command = _tau2_command( + config, + domain="retail", + strategy=strategy, + configured_run_id="unit", + run_label="unit_retail_s84_r1", + task_ids=["5"], + num_tasks=None, + train_num_tasks=None, + seed=303, + ) + + assert command is not None + assert "--skip-train" in command + assert "--scope-prompt-file" in command + assert command[command.index("--search-uri") + 1].endswith("/memories/procedures/retail") + assert command[command.index("--prewrite-retrieval-top-k") + 1] == "6" + assert command[command.index("--prewrite-inject-top-k") + 1] == "2" + + def test_category_rerank_keeps_positive_category_match() -> None: rows = [ { From d833980e9a33efdcd908df191ffbbac0e4cfa317 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 16:32:47 +0800 Subject: [PATCH 38/42] bench(tau2): add scoped trajectory eval concurrency --- benchmark/tau2/README.md | 8 + benchmark/tau2/config/baseline.yaml | 1 + .../scope_prompts/airline_memory_scope.md | 18 ++ .../scope_prompts/retail_memory_scope.md | 16 ++ benchmark/tau2/config/trajectory.yaml | 11 ++ benchmark/tau2/scripts/run_eval.py | 169 +++++++++++++++--- benchmark/tau2/scripts/run_memory_v2_eval.py | 23 ++- 7 files changed, 211 insertions(+), 35 deletions(-) create mode 100644 benchmark/tau2/config/scope_prompts/airline_memory_scope.md create mode 100644 benchmark/tau2/config/scope_prompts/retail_memory_scope.md diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 998c93d440..45d947cc26 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -133,6 +133,9 @@ Memory V2 cells run through a small TAU-2 agent adapter in this directory: regenerate that step with the matched memories. The default benchmark retrieves 6 pre-write candidates and injects 2, which keeps extra candidates visible in traces without expanding the prompt budget; +- optionally run an explicit scope-prompt treatment that keeps retrieved + memories advisory and asks the agent to preserve the current task scope before + write-like tool calls; - emit artifact metadata to identify the OpenViking account, agent, corpus, retrieval mode, and simulator policy used by each cell. @@ -145,6 +148,11 @@ is retrieved during eval (`experiences` by default, `trajectories` for parallel with `benchmark.corpus_prepare_concurrency`; session commits inside one corpus remain serial to preserve OpenViking write semantics. +Eval cells run in parallel with `benchmark.strategy_concurrency` by default and +can be overridden with `--strategy-concurrency`. This only parallelizes read-only +TAU-2 eval cells; corpus writes inside one corpus are still serialized by the +prepare step. + ## User Simulator Policy The runner default is the official TAU-2 user simulator if diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 22cf4f4ecc..ef692f43a1 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -6,6 +6,7 @@ benchmark: train_split_name: train eval_split_name: test repeat_count: 8 + strategy_concurrency: 8 task_max_concurrency: 10 corpus_prepare_concurrency: 2 max_steps: 200 diff --git a/benchmark/tau2/config/scope_prompts/airline_memory_scope.md b/benchmark/tau2/config/scope_prompts/airline_memory_scope.md new file mode 100644 index 0000000000..8847796a97 --- /dev/null +++ b/benchmark/tau2/config/scope_prompts/airline_memory_scope.md @@ -0,0 +1,18 @@ + +OpenViking memories are advisory. Use them only when their trigger, preconditions, +and applicability boundary match the current airline task. + +- Do not broaden the user's requested booking, cancellation, rebooking, flight + update, passenger update, baggage update, insurance, or payment scope because a + retrieved memory describes a nearby workflow. +- Keep the current reservation scope explicit. Only use flights, passengers, + baggage entries, cabin changes, insurance choices, payment IDs, dates, and + amounts that are grounded in user input, recent tool observations, reservation + state, profile/payment state, or an explicit search/lookup result. +- Before a write tool call, verify that the selected write action matches the + user's requested operation. Do not mix cancellation, rebooking, upgrade, + downgrade, baggage, or passenger-update flows unless the current task asks for + that combined operation. +- If a memory and the current task disagree, follow the current task state and the + domain policy. + diff --git a/benchmark/tau2/config/scope_prompts/retail_memory_scope.md b/benchmark/tau2/config/scope_prompts/retail_memory_scope.md new file mode 100644 index 0000000000..65a8d61fff --- /dev/null +++ b/benchmark/tau2/config/scope_prompts/retail_memory_scope.md @@ -0,0 +1,16 @@ + +OpenViking memories are advisory. Use them only when their trigger, preconditions, +and applicability boundary match the current retail task. + +- Do not broaden the user's requested replacement, return, exchange, cancellation, + address-change, or payment scope because a retrieved memory describes a nearby + workflow. +- If the user restricts the request to the current order, same order, observed + order items, or a specific product variant, choose write arguments only from the + current tool observations or an explicitly requested catalog lookup. +- Before a write tool call, order IDs, item IDs, new item IDs, payment method IDs, + addresses, amounts, and refund/payment direction must be grounded in user input, + recent tool observations, profile/order state, or an explicit catalog lookup. +- If a memory and the current task disagree, follow the current task state and the + domain policy. + diff --git a/benchmark/tau2/config/trajectory.yaml b/benchmark/tau2/config/trajectory.yaml index 5aad55fae1..aabded08cf 100644 --- a/benchmark/tau2/config/trajectory.yaml +++ b/benchmark/tau2/config/trajectory.yaml @@ -20,3 +20,14 @@ strategies: train_memory_mode: experience_only search_memory_type: trajectories retrieval_mode: first_user_prewrite + - id: memory_v2_trajectory_prewrite_scope + label: OpenViking Memory V2 trajectory-view pre-write recall with scope prompt + memory_backend: openviking + train_required: true + corpus_id: memory_v2_trajectory_view + train_memory_mode: experience_only + search_memory_type: trajectories + retrieval_mode: first_user_prewrite + scope_prompt_files: + retail: benchmark/tau2/config/scope_prompts/retail_memory_scope.md + airline: benchmark/tau2/config/scope_prompts/airline_memory_scope.md diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 0afa2dfef8..3ecec3fef0 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -191,6 +191,7 @@ def _tau2_command( agent_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' user_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' fixed_first_user_file = _fixed_first_user_file(config, domain) + scope_prompt_file = _scope_prompt_file(config, strategy, domain) if ( strategy.get("memory_backend") == "openviking" @@ -289,6 +290,8 @@ def _tau2_command( ] if fixed_first_user_file is not None: command.extend(["--fixed-first-user-file", str(fixed_first_user_file)]) + if scope_prompt_file is not None: + command.extend(["--scope-prompt-file", str(scope_prompt_file)]) if task_ids: for task_id in task_ids: command.extend(["--task-id", task_id]) @@ -359,6 +362,23 @@ def _fixed_first_user_file(config: dict[str, Any], domain: str) -> Path | None: return resolve_path(str(raw)) +def _scope_prompt_file( + config: dict[str, Any], strategy: dict[str, Any], domain: str +) -> Path | None: + raw = strategy.get("scope_prompt_file") + if raw is None: + raw = strategy.get("scope_prompt_files") + if raw is None: + raw = config.get("openviking", {}).get("scope_prompt_file") + if raw is None: + raw = config.get("openviking", {}).get("scope_prompt_files") + if isinstance(raw, dict): + raw = raw.get(domain) or raw.get("default") + if raw is None or str(raw).strip() == "": + return None + return resolve_path(str(raw)) + + def _build_plan( config: dict[str, Any], configured_run_id: str, @@ -369,9 +389,20 @@ def _build_plan( num_tasks: int | None, train_num_tasks: int | None, repeat_count_override: int | None, + cell_concurrency_override: int | None, + strategy_concurrency_override: int | None, ) -> dict[str, Any]: repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 8)) base_seed = int(config["benchmark"].get("seed", 300)) + cell_timeout_seconds = int(config["benchmark"].get("cell_timeout_seconds", 0) or 0) + strategy_concurrency = strategy_concurrency_override + if strategy_concurrency is None: + strategy_concurrency = cell_concurrency_override + if strategy_concurrency is None: + strategy_concurrency = config["benchmark"].get("strategy_concurrency") + if strategy_concurrency is None: + strategy_concurrency = config["benchmark"].get("cell_concurrency", 1) + strategy_concurrency = max(1, int(strategy_concurrency or 1)) policy_report = simulator_policy_report(config) strategies = config.get("strategies") or [] if selected_strategy_ids: @@ -406,6 +437,7 @@ def _build_plan( seed=seed, ) fixed_first_user_file = _fixed_first_user_file(config, domain) + scope_prompt_file = _scope_prompt_file(config, strategy, domain) non_executable_reason = None if command is None: non_executable_reason = ( @@ -457,6 +489,7 @@ def _build_plan( "fixed_first_user_file": str(fixed_first_user_file) if fixed_first_user_file else None, + "scope_prompt_file": str(scope_prompt_file) if scope_prompt_file else None, "split_file": str(split_path), "command": command, "non_executable_reason": non_executable_reason, @@ -475,6 +508,9 @@ def _build_plan( "executable_cell_count": executable_cell_count, "pending_cell_count": len(cells) - executable_cell_count, "corpus_prepare_concurrency": int(config["benchmark"].get("corpus_prepare_concurrency", 1)), + "strategy_concurrency": strategy_concurrency, + "cell_concurrency": strategy_concurrency, + "cell_timeout_seconds": cell_timeout_seconds or None, "cells": cells, } @@ -557,9 +593,7 @@ def _prepare_memory_corpus(cell: dict[str, Any], repo: Path, out: Path) -> dict[ "returncode": completed.returncode, "stdout_tail": completed.stdout[-4000:], "stderr_tail": completed.stderr[-4000:], - "artifacts": { - "corpus_manifest": str(Path(cell["corpus_dir"]) / "corpus_manifest.json") - }, + "artifacts": {"corpus_manifest": str(Path(cell["corpus_dir"]) / "corpus_manifest.json")}, } write_json(out / "corpus_prepare_results" / f"{key}.json", row) if completed.returncode != 0: @@ -648,22 +682,16 @@ def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: } -def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: - policy_report = plan.get("simulator_policy") or {} - if not policy_report.get("supported", False): - raise RuntimeError( - "configured user simulator policy is not supported by this TAU-2 checkout: " - f"{policy_report}" - ) - _prepare_memory_corpora(plan, repo, out) - rows = [] - for cell in plan["cells"]: - if not cell.get("executable"): - raise RuntimeError( - f"cell is not executable yet: {cell['run_label']} " - f"(strategy_id={cell['strategy_id']}, adapter_status={cell.get('adapter_status')})" - ) - print(f"[tau2] running {cell['run_label']}") +def _execute_cell(cell: dict[str, Any], repo: Path, out: Path, cell_timeout: int | None) -> dict[str, Any]: + cell_result_path = out / "cell_results" / f"{cell['run_label']}.json" + if cell_result_path.is_file(): + existing_row = json.loads(cell_result_path.read_text(encoding="utf-8")) + if existing_row.get("returncode") == 0 and existing_row.get("metrics"): + print(f"[tau2] skipping completed {cell['run_label']}", flush=True) + return existing_row + + print(f"[tau2] running {cell['run_label']}", flush=True) + try: completed = subprocess.run( cell["command"], cwd=repo, @@ -672,26 +700,88 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False, + timeout=cell_timeout, ) + except subprocess.TimeoutExpired as exc: + stdout = exc.stdout or "" + stderr = exc.stderr or "" + if isinstance(stdout, bytes): + stdout = stdout.decode(errors="replace") + if isinstance(stderr, bytes): + stderr = stderr.decode(errors="replace") row = { "run_label": cell["run_label"], "domain": cell["domain"], "strategy_id": cell["strategy_id"], - "returncode": completed.returncode, - "stdout_tail": completed.stdout[-4000:], - "stderr_tail": completed.stderr[-4000:], + "returncode": 124, + "timed_out": True, + "timeout_seconds": cell_timeout, + "stdout_tail": stdout[-4000:], + "stderr_tail": stderr[-4000:], + "artifacts": _cell_artifacts(cell, repo, out), + "metrics": None, } - row["artifacts"] = _cell_artifacts(cell, repo, out) - row["metrics"] = _cell_metrics(cell, row["artifacts"]) - rows.append(row) - write_json(out / "cell_results" / f"{cell['run_label']}.json", row) - if completed.returncode != 0: + write_json(cell_result_path, row) + return row + + row = { + "run_label": cell["run_label"], + "domain": cell["domain"], + "strategy_id": cell["strategy_id"], + "returncode": completed.returncode, + "stdout_tail": completed.stdout[-4000:], + "stderr_tail": completed.stderr[-4000:], + } + row["artifacts"] = _cell_artifacts(cell, repo, out) + row["metrics"] = _cell_metrics(cell, row["artifacts"]) + write_json(cell_result_path, row) + return row + + +def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: + policy_report = plan.get("simulator_policy") or {} + if not policy_report.get("supported", False): + raise RuntimeError( + "configured user simulator policy is not supported by this TAU-2 checkout: " + f"{policy_report}" + ) + _prepare_memory_corpora(plan, repo, out) + cells = [] + for cell in plan["cells"]: + if not cell.get("executable"): raise RuntimeError( - f"cell failed: {cell['run_label']} returncode={completed.returncode}" + f"cell is not executable yet: {cell['run_label']} " + f"(strategy_id={cell['strategy_id']}, adapter_status={cell.get('adapter_status')})" ) + cells.append(cell) + + cell_timeout = int(plan.get("cell_timeout_seconds") or 0) or None + worker_count = max( + 1, int(plan.get("strategy_concurrency") or plan.get("cell_concurrency") or 1) + ) + if worker_count == 1 or len(cells) == 1: + return [_execute_cell(cell, repo, out, cell_timeout) for cell in cells] + + print(f"[tau2] running eval cells with concurrency={worker_count}", flush=True) + rows: list[dict[str, Any]] = [] + with ThreadPoolExecutor(max_workers=worker_count) as executor: + futures = { + executor.submit(_execute_cell, cell, repo, out, cell_timeout): cell + for cell in cells + } + for future in as_completed(futures): + rows.append(future.result()) return rows +def _execution_failures(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + return [ + row + for row in rows + if row.get("returncode") != 0 or row.get("timed_out") or not row.get("metrics") + ] + + def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: errors: list[str] = [] llm_env = normalize_litellm_env() @@ -760,6 +850,16 @@ def main() -> int: parser.add_argument( "--repeat-count", type=int, help="Override benchmark.repeat_count for smoke runs." ) + parser.add_argument( + "--cell-concurrency", + type=int, + help="Deprecated alias for --strategy-concurrency.", + ) + parser.add_argument( + "--strategy-concurrency", + type=int, + help="Override benchmark.strategy_concurrency for parallel matrix cells.", + ) parser.add_argument( "--strategy-id", action="append", help="Run only this strategy id; may be repeated." ) @@ -789,6 +889,10 @@ def main() -> int: if args.plan_only and args.execute: raise SystemExit("--plan-only and --execute are mutually exclusive") + if args.cell_concurrency is not None and args.cell_concurrency < 1: + raise SystemExit("--cell-concurrency must be >= 1") + if args.strategy_concurrency is not None and args.strategy_concurrency < 1: + raise SystemExit("--strategy-concurrency must be >= 1") config = load_config(args.config) out = output_dir(config, args.run_id) @@ -807,6 +911,8 @@ def main() -> int: num_tasks=args.num_tasks, train_num_tasks=args.train_num_tasks, repeat_count_override=args.repeat_count, + cell_concurrency_override=args.cell_concurrency, + strategy_concurrency_override=args.strategy_concurrency, ) write_json(out / "run_plan.json", plan) write_json(out / "resolved_config.json", config) @@ -815,10 +921,15 @@ def main() -> int: if args.execute: try: rows = _execute_cells(plan, tau2_repo(config), out) - plan["status"] = "succeeded" + failures = _execution_failures(rows) + plan["status"] = "failed" if failures else "succeeded" plan["executed_cell_count"] = len(rows) + plan["failed_cell_count"] = len(failures) write_json(out / "run_plan.json", plan) write_json(out / "scoreboard.json", _summarize(rows)) + if failures: + labels = ", ".join(str(row.get("run_label")) for row in failures[:5]) + raise RuntimeError(f"{len(failures)} cell(s) failed or incomplete: {labels}") except Exception as exc: plan["status"] = "failed" plan["error"] = str(exc) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 5b0cd62003..aab783f1d0 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -208,7 +208,9 @@ def _append_incoming_user_context(message: Any, state: Any) -> None: state.messages.extend(message.tool_messages) elif isinstance(message, ToolMessage): state.messages.append(message) - elif isinstance(message, AssistantMessage) and (message.has_content() or message.is_tool_call()): + elif isinstance(message, AssistantMessage) and ( + message.has_content() or message.is_tool_call() + ): state.messages.append(message) @@ -229,8 +231,7 @@ def _generate_next_message(self, message: Any, state: Any) -> UserMessage: # ty fixed = mapping.get(key) if fixed is None: raise RuntimeError( - "fixed-first-user fixture does not cover this TAU-2 scenario: " - f"sha256={key}" + f"fixed-first-user fixture does not cover this TAU-2 scenario: sha256={key}" ) _append_incoming_user_context(message, state) return UserMessage(role="user", content=fixed) @@ -454,9 +455,15 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None: from tau2.registry import registry from tau2.utils.llm_utils import generate + scope_prompt = "" + if args.scope_prompt_file is not None: + scope_prompt = args.scope_prompt_file.read_text(encoding="utf-8").strip() + class OpenVikingMemoryAgent(LLMAgent): def get_init_state(self, message_history=None): state = super().get_init_state(message_history) + if scope_prompt: + state.system_messages.append(SystemMessage(role="system", content=scope_prompt)) if args.retrieval_mode in {"first_user", "first_user_prewrite"}: state.system_messages.append( SystemMessage(role="system", content="") @@ -469,9 +476,7 @@ def _retrieve( client = _client(args) rows: list[dict[str, Any]] = [] try: - result = client.search( - query=query, target_uri=args.search_uri, limit=search_limit - ) + result = client.search(query=query, target_uri=args.search_uri, limit=search_limit) memories = list(getattr(result, "memories", []) or []) blocks = [] for index, match in enumerate(memories[:search_limit], 1): @@ -705,6 +710,7 @@ def main() -> int: parser.add_argument("--prewrite-retrieval-top-k", type=int) parser.add_argument("--prewrite-inject-top-k", type=int) parser.add_argument("--fixed-first-user-file", type=Path) + parser.add_argument("--scope-prompt-file", type=Path) parser.add_argument( "--retrieval-mode", choices=["first_user", "prewrite", "first_user_prewrite"], @@ -750,6 +756,10 @@ def main() -> int: args.prewrite_inject_top_k = args.prewrite_inject_top_k or args.prewrite_retrieval_top_k if args.fixed_first_user_file is not None: args.fixed_first_user_file = args.fixed_first_user_file.expanduser().resolve() + if args.scope_prompt_file is not None: + args.scope_prompt_file = args.scope_prompt_file.expanduser().resolve() + if not args.scope_prompt_file.is_file(): + parser.error(f"--scope-prompt-file does not exist: {args.scope_prompt_file}") train_results = corpus_dir / "train_results.json" corpus_manifest = corpus_dir / "corpus_manifest.json" eval_results = args.run_dir / f"{args.run_label}.json" @@ -850,6 +860,7 @@ def main() -> int: "fixed_first_user_file": str(args.fixed_first_user_file) if args.fixed_first_user_file else None, + "scope_prompt_file": str(args.scope_prompt_file) if args.scope_prompt_file else None, "corpus": corpus, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), From 74c18db57c3c382eee0f926190c7cb07ef4af9c1 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 17:34:44 +0800 Subject: [PATCH 39/42] style(benchmark): format tau2 eval runner --- benchmark/tau2/scripts/run_eval.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 3ecec3fef0..cf0ed2e028 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -682,7 +682,9 @@ def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: } -def _execute_cell(cell: dict[str, Any], repo: Path, out: Path, cell_timeout: int | None) -> dict[str, Any]: +def _execute_cell( + cell: dict[str, Any], repo: Path, out: Path, cell_timeout: int | None +) -> dict[str, Any]: cell_result_path = out / "cell_results" / f"{cell['run_label']}.json" if cell_result_path.is_file(): existing_row = json.loads(cell_result_path.read_text(encoding="utf-8")) @@ -766,8 +768,7 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str rows: list[dict[str, Any]] = [] with ThreadPoolExecutor(max_workers=worker_count) as executor: futures = { - executor.submit(_execute_cell, cell, repo, out, cell_timeout): cell - for cell in cells + executor.submit(_execute_cell, cell, repo, out, cell_timeout): cell for cell in cells } for future in as_completed(futures): rows.append(future.result()) From b8884cfed90a40f2210aeb437f7db8878b5bf919 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 17:39:32 +0800 Subject: [PATCH 40/42] style(benchmark): format tau2 category rerank --- benchmark/tau2/scripts/category_rerank.py | 32 +++++++++++++------- benchmark/tau2/scripts/run_eval.py | 4 +-- benchmark/tau2/scripts/run_memory_v2_eval.py | 24 +++++++-------- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/benchmark/tau2/scripts/category_rerank.py b/benchmark/tau2/scripts/category_rerank.py index 6045b4c696..8e03b0cd0a 100644 --- a/benchmark/tau2/scripts/category_rerank.py +++ b/benchmark/tau2/scripts/category_rerank.py @@ -270,7 +270,9 @@ def select( has_positive_match = positive_level in {"category1", "category2"} if has_positive_match: before_count = len(sorted_scored) - guarded = [item for item in sorted_scored if not item[3].get("category_explicit_mismatch")] + guarded = [ + item for item in sorted_scored if not item[3].get("category_explicit_mismatch") + ] if guarded: filtered = guarded decision = "soft_reranked_with_mismatch_guard" @@ -328,7 +330,9 @@ def _loaded_files(load_report: dict[str, Any]) -> list[str]: return [] -def _load_catalog(raw_path: Any, *, repo_root: Path) -> tuple[dict[str, list[CategoryEntry]], dict[str, Any]]: +def _load_catalog( + raw_path: Any, *, repo_root: Path +) -> tuple[dict[str, list[CategoryEntry]], dict[str, Any]]: report = { "path": None, "loaded": False, @@ -463,14 +467,18 @@ def _candidate_score( if (query_c1 or query_c2) and not (memory_c1 or memory_c2): score -= 2.0 reasons.append("missing_memory_category") - return score, reasons, { - "category1_match": category1_match, - "category2_match": category2_match, - "category_explicit_mismatch": bool( - (query_c1 and memory_c1 and not category1_match) - or (query_c2 and memory_c2 and not category2_match) - ), - } + return ( + score, + reasons, + { + "category1_match": category1_match, + "category2_match": category2_match, + "category_explicit_mismatch": bool( + (query_c1 and memory_c1 and not category1_match) + or (query_c2 and memory_c2 and not category2_match) + ), + }, + ) def _row_key(row: dict[str, Any]) -> str: @@ -494,7 +502,9 @@ def _mark_selected( traced = _public_row(row) traced["raw_rank"] = index traced["selected_for_injection"] = key in selected_keys - traced["injected"] = bool(traced["selected_for_injection"] and int(row.get("text_chars") or 0) > 0) + traced["injected"] = bool( + traced["selected_for_injection"] and int(row.get("text_chars") or 0) > 0 + ) if not traced["selected_for_injection"]: traced["skipped_reason"] = ( "category_rerank_inject_limit" if key in kept_keys else "category_rerank" diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index e1c09acae3..b0d9365a7a 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -761,9 +761,7 @@ def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: metric_rows = [row for row in rows_for_group if row.get("metrics")] valid_metric_rows = [row for row in metric_rows if _row_is_valid_evidence(row)] - diagnostic_rows = [ - row for row in metric_rows if not _row_is_valid_evidence(row) - ] + diagnostic_rows = [row for row in metric_rows if not _row_is_valid_evidence(row)] diagnostic_reason_counts: Counter[str] = Counter() for row in diagnostic_rows: evidence = row.get("runtime_evidence") diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index b035bb58ac..77a48cbbbf 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -245,7 +245,9 @@ def _trace_category_summary(trace_path: Path) -> dict[str, Any]: if isinstance(call, dict) and call.get("name"): tool_calls[str(call["name"])] += 1 - category = row.get("category_rerank") if isinstance(row.get("category_rerank"), dict) else {} + category = ( + row.get("category_rerank") if isinstance(row.get("category_rerank"), dict) else {} + ) if category: category_event_count += 1 if category.get("enabled"): @@ -404,9 +406,8 @@ def _runtime_evidence_status( if int(corpus_probe.get("match_count") or 0) > 0: if int(corpus_probe.get("concrete_match_count") or 0) <= 0: reasons.append("no_concrete_corpus_probe_matches") - if ( - int(corpus_probe.get("aggregate_match_count") or 0) - == int(corpus_probe.get("match_count") or 0) + if int(corpus_probe.get("aggregate_match_count") or 0) == int( + corpus_probe.get("match_count") or 0 ): reasons.append("aggregate_only_corpus_probe") @@ -446,15 +447,13 @@ def _runtime_evidence_status( reasons.append("no_injected_concrete_memory") if ( int(counts.get("query_category_matched_event_count") or 0) > 0 - and float(rates.get("selected_positive_category_match_rate") or 0.0) - <= 0.0 + and float(rates.get("selected_positive_category_match_rate") or 0.0) <= 0.0 ): reasons.append("no_selected_positive_category_match") if ( int(counts.get("query_category_matched_event_count") or 0) > 0 and int(counts.get("memory_injection_event_count") or 0) > 0 - and int(counts.get("injected_concrete_positive_category_match_count") or 0) - <= 0 + and int(counts.get("injected_concrete_positive_category_match_count") or 0) <= 0 ): reasons.append("no_injected_concrete_positive_category_match") @@ -926,10 +925,7 @@ def _trace_injection_fields(block: str, matches: list[dict[str, Any]]) -> dict[s 1 for row in matches if row.get("injected") - or ( - row.get("selected_for_injection", True) - and int(row.get("text_chars") or 0) > 0 - ) + or (row.get("selected_for_injection", True) and int(row.get("text_chars") or 0) > 0) ) return { "injected": bool(block.strip()), @@ -1197,7 +1193,9 @@ def main() -> int: if not args.scope_prompt_file.is_file(): parser.error(f"--scope-prompt-file does not exist: {args.scope_prompt_file}") if isinstance(args.scope_prompt_config, dict) and args.scope_prompt_config.get("enabled"): - parser.error("--scope-prompt-file and enabled --scope-prompt-config are mutually exclusive") + parser.error( + "--scope-prompt-file and enabled --scope-prompt-config are mutually exclusive" + ) args.scope_prompt_config = { "enabled": True, "domain_files": {args.domain: str(args.scope_prompt_file)}, From 8648c5d201746ed8e1228aa9142a005bb7edb447 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 17:43:38 +0800 Subject: [PATCH 41/42] bench(tau2): add first-user category diagnostic config --- .../custom_s84_category_first_user.yaml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 benchmark/tau2/config/custom_s84_category_first_user.yaml diff --git a/benchmark/tau2/config/custom_s84_category_first_user.yaml b/benchmark/tau2/config/custom_s84_category_first_user.yaml new file mode 100644 index 0000000000..a22f8c932f --- /dev/null +++ b/benchmark/tau2/config/custom_s84_category_first_user.yaml @@ -0,0 +1,42 @@ +extends: custom_s84_category.yaml + +benchmark: + name: tau2_openviking_custom_s84_scope_category_first_user + +strategies: + - id: custom_s84_scope_category_first_user_positive_match + label: Custom procedure L2 S84 scope prompt + first-user/prewrite category positive-match + memory_backend: openviking + train_required: false + corpus_id: custom_procedure_steps_l2_20260510 + train_memory_mode: external_procedure_l2 + search_memory_type: procedures + retrieval_mode: first_user_prewrite + external_openviking: + retail: + url: ${OPENVIKING_CUSTOM_URL:-http://localhost:1950} + account: agent-harness-custom-procedure-steps-retail-20260510 + user: tau2-retail-custom-procedure-steps-user-20260510 + agent_id: tau2-retail-custom-procedure-steps-agent-20260510 + search_uri: viking://agent/tau2-retail-custom-procedure-steps-agent-20260510/memories/procedures/retail + airline: + url: ${OPENVIKING_CUSTOM_URL:-http://localhost:1950} + account: agent-harness-custom-procedure-steps-airline-20260510 + user: tau2-airline-custom-procedure-steps-user-20260510 + agent_id: tau2-airline-custom-procedure-steps-agent-20260510 + search_uri: viking://agent/tau2-airline-custom-procedure-steps-agent-20260510/memories/procedures/airline + scope_prompt_files: + retail: benchmark/tau2/config/scope_prompts/retail_memory_scope.md + airline: benchmark/tau2/config/scope_prompts/airline_memory_scope.md + category_rerank: + enabled: true + catalog_path: benchmark/tau2/config/category_catalog.json + apply_nodes: + - first_user + - before_write_tool_call + retrieve_limit: 6 + inject_limit: 2 + mismatch_policy: keep_positive_match_drop_mismatch + positive_match_required: true + no_match_policy: skip_injection + search_score_weight: 0.0 From 8cf7737cd7c7aa5f3850f9f5b02c20117b019799 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Thu, 14 May 2026 17:46:58 +0800 Subject: [PATCH 42/42] style(benchmark): satisfy tau2 eval lint --- benchmark/tau2/scripts/run_eval.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index cf0ed2e028..90f5bc97a4 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -17,12 +17,11 @@ load_config, normalize_litellm_env, output_dir, - run_id, resolve_path, + run_id, simulator_policy_report, split_file, strategy_ids, - tau2_cli, tau2_context, tau2_repo, user_simulator_policy,