r266-tech
diff --git a/benchmark/tau2/.gitignore b/benchmark/tau2/llm/.gitignore
rename from benchmark/tau2/.gitignore
rename to benchmark/tau2/llm/.gitignore
diff --git a/benchmark/tau2/README.md b/benchmark/tau2/llm/README.md
rename from benchmark/tau2/README.md
rename to benchmark/tau2/llm/README.md
Lines changed: 35 additions & 35 deletions
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/llm/config/baseline.yaml
rename from benchmark/tau2/config/baseline.yaml
rename to benchmark/tau2/llm/config/baseline.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/benchmark/tau2/config/fixed_first_user_bootstrap.yaml b/benchmark/tau2/llm/config/fixed_first_user_bootstrap.yaml
rename from benchmark/tau2/config/fixed_first_user_bootstrap.yaml
rename to benchmark/tau2/llm/config/fixed_first_user_bootstrap.yaml
diff --git a/benchmark/tau2/config/no_memory.yaml b/benchmark/tau2/llm/config/no_memory.yaml
rename from benchmark/tau2/config/no_memory.yaml
rename to benchmark/tau2/llm/config/no_memory.yaml
diff --git a/benchmark/tau2/config/official.yaml b/benchmark/tau2/llm/config/official.yaml
rename from benchmark/tau2/config/official.yaml
rename to benchmark/tau2/llm/config/official.yaml
diff --git a/benchmark/tau2/config/prb_content_matrix_new_prompt.yaml b/benchmark/tau2/llm/config/prb_content_matrix_new_prompt.yaml
rename from benchmark/tau2/config/prb_content_matrix_new_prompt.yaml
rename to benchmark/tau2/llm/config/prb_content_matrix_new_prompt.yaml
Lines changed: 8 additions & 8 deletions
diff --git a/benchmark/tau2/config/prb_scope_fairness.yaml b/benchmark/tau2/llm/config/prb_scope_fairness.yaml
rename from benchmark/tau2/config/prb_scope_fairness.yaml
rename to benchmark/tau2/llm/config/prb_scope_fairness.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/llm/config/prewrite.yaml
rename from benchmark/tau2/config/prewrite.yaml
rename to benchmark/tau2/llm/config/prewrite.yaml
diff --git a/benchmark/tau2/config/scope_prompts/generic_memory_scope.md b/benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
rename from benchmark/tau2/config/scope_prompts/generic_memory_scope.md
rename to benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
@@ -12,7 +12,7 @@ Category rerank and other harness-only diagnostics are intentionally left out.
 ## Layout
 
 ```text
-benchmark/tau2/
+benchmark/tau2/llm/
 ├── config/
 │   ├── baseline.yaml
 │   ├── official.yaml
@@ -25,9 +25,9 @@ benchmark/tau2/
 └── run_full_eval.sh
 ```
 
-Generated eval artifacts are written to `benchmark/tau2/result/<run_id>/`.
+Generated eval artifacts are written to `benchmark/tau2/llm/result/<run_id>/`.
 Memory corpus artifacts are cached outside the run id at
-`benchmark/tau2/result/memory_corpora/` by default.
+`benchmark/tau2/llm/result/memory_corpora/` by default.
 
 ## Quick Start
 
@@ -59,26 +59,26 @@ For a local one-command setup, clone and install TAU-2 into ignored benchmark
 directories:
 
 ```bash
-benchmark/tau2/scripts/setup_tau2_repo.sh
-source benchmark/tau2/.env.tau2
+benchmark/tau2/llm/scripts/setup_tau2_repo.sh
+source benchmark/tau2/llm/.env.tau2
 ```
 
 For PR-B-compatible reproduction, pin the TAU-2 checkout to a ref that includes
 the confirmation-aware text-user-simulator prompt. The original PR-B evidence
 used the open TAU-2 fix PR head (`79dbf0c18ac7637aedf869cb3122babcd57aaf17`):
 
 ```bash
-benchmark/tau2/scripts/setup_tau2_repo.sh \
+benchmark/tau2/llm/scripts/setup_tau2_repo.sh \
   --ref refs/pull/297/head
-source benchmark/tau2/.env.tau2
+source benchmark/tau2/llm/.env.tau2
 ```
 
 Reference: [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297).
 
 Plan the default benchmark without running TAU-2:
 
 ```bash
-python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only
+python benchmark/tau2/llm/scripts/run_eval.py --config benchmark/tau2/llm/config/baseline.yaml --plan-only
 ```
 
 Add `--preflight` or `--strict-preflight` when you want the runner to write a
@@ -87,8 +87,8 @@ small environment/config check next to the run plan.
 After setup, verify the local TAU-2 link and write a one-cell run plan:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/baseline.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/baseline.yaml \
   --strict-preflight \
   --domain retail \
   --strategy-id memory_v2_experience_only \
@@ -99,8 +99,8 @@ benchmark/tau2/run_full_eval.sh \
 Plan a one-cell Memory V2 pre-write smoke:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/baseline.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/baseline.yaml \
   --domain retail \
   --strategy-id memory_v2_prewrite \
   --num-tasks 1 \
@@ -110,8 +110,8 @@ benchmark/tau2/run_full_eval.sh \
 Plan a one-cell trajectory memory smoke:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/trajectory.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/trajectory.yaml \
   --domain retail \
   --strategy-id memory_v2_trajectory_view \
   --num-tasks 1 \
@@ -122,8 +122,8 @@ benchmark/tau2/run_full_eval.sh \
 Run the Memory V2 8-trial matrix (`retail + airline` x 2 strategies x 8 repeats):
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/baseline.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/baseline.yaml \
   --execute
 ```
 
@@ -143,15 +143,15 @@ the same confirmation-aware simulator policy but does not require fixed fixtures
 Run one bootstrap pass per domain:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/fixed_first_user_bootstrap.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/fixed_first_user_bootstrap.yaml \
   --domain retail \
   --run-id fixed_first_user_bootstrap_retail \
   --strict-preflight \
   --execute
 
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/fixed_first_user_bootstrap.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/fixed_first_user_bootstrap.yaml \
   --domain airline \
   --run-id fixed_first_user_bootstrap_airline \
   --strict-preflight \
@@ -161,40 +161,40 @@ benchmark/tau2/run_full_eval.sh \
 Then convert each bootstrap `results.json` into a fixture:
 
 ```bash
-RETAIL_RESULTS=benchmark/tau2/result/fixed_first_user_bootstrap_retail/memory_cells/fixed_first_user_bootstrap_retail_retail_no_memory_r1/fixed_first_user_bootstrap_retail_retail_no_memory_r1.json
-AIRLINE_RESULTS=benchmark/tau2/result/fixed_first_user_bootstrap_airline/memory_cells/fixed_first_user_bootstrap_airline_airline_no_memory_r1/fixed_first_user_bootstrap_airline_airline_no_memory_r1.json
+RETAIL_RESULTS=benchmark/tau2/llm/result/fixed_first_user_bootstrap_retail/memory_cells/fixed_first_user_bootstrap_retail_retail_no_memory_r1/fixed_first_user_bootstrap_retail_retail_no_memory_r1.json
+AIRLINE_RESULTS=benchmark/tau2/llm/result/fixed_first_user_bootstrap_airline/memory_cells/fixed_first_user_bootstrap_airline_airline_no_memory_r1/fixed_first_user_bootstrap_airline_airline_no_memory_r1.json
 
-python benchmark/tau2/scripts/build_fixed_first_user_fixture.py \
+python benchmark/tau2/llm/scripts/build_fixed_first_user_fixture.py \
   --repo "$TAU2_REPO" \
   --results-json "$RETAIL_RESULTS" \
   --domain retail \
   --task-split-name test \
-  --output benchmark/tau2/result/fixed_first_user_fixtures/retail/fixed_first_user_fixture.json \
+  --output benchmark/tau2/llm/result/fixed_first_user_fixtures/retail/fixed_first_user_fixture.json \
   --require-full-split
 
-python benchmark/tau2/scripts/build_fixed_first_user_fixture.py \
+python benchmark/tau2/llm/scripts/build_fixed_first_user_fixture.py \
   --repo "$TAU2_REPO" \
   --results-json "$AIRLINE_RESULTS" \
   --domain airline \
   --task-split-name test \
-  --output benchmark/tau2/result/fixed_first_user_fixtures/airline/fixed_first_user_fixture.json \
+  --output benchmark/tau2/llm/result/fixed_first_user_fixtures/airline/fixed_first_user_fixture.json \
   --require-full-split
 ```
 
 Export the generated fixture paths for subsequent strict runs:
 
 ```bash
-export TAU2_RETAIL_FIXED_FIRST_USER_FILE="$PWD/benchmark/tau2/result/fixed_first_user_fixtures/retail/fixed_first_user_fixture.json"
-export TAU2_AIRLINE_FIXED_FIRST_USER_FILE="$PWD/benchmark/tau2/result/fixed_first_user_fixtures/airline/fixed_first_user_fixture.json"
+export TAU2_RETAIL_FIXED_FIRST_USER_FILE="$PWD/benchmark/tau2/llm/result/fixed_first_user_fixtures/retail/fixed_first_user_fixture.json"
+export TAU2_AIRLINE_FIXED_FIRST_USER_FILE="$PWD/benchmark/tau2/llm/result/fixed_first_user_fixtures/airline/fixed_first_user_fixture.json"
 ```
 
 ### 2. Run smoke and full PR-B matrix
 
 First run one tiny end-to-end smoke against a clean local OpenViking service:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/prb_content_matrix_new_prompt.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/prb_content_matrix_new_prompt.yaml \
   --domain retail \
   --strategy-id new_traj_fixed_first_user_prewrite \
   --num-tasks 1 \
@@ -207,24 +207,24 @@ benchmark/tau2/run_full_eval.sh \
 Then run the full PR-B matrix:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/prb_content_matrix_new_prompt.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/prb_content_matrix_new_prompt.yaml \
   --run-id prb_content_matrix_new_prompt_full8 \
   --strict-preflight \
   --execute
 ```
 
 The main result is written to
-`benchmark/tau2/result/prb_content_matrix_new_prompt_full8/scoreboard.json`.
+`benchmark/tau2/llm/result/prb_content_matrix_new_prompt_full8/scoreboard.json`.
 Per-cell execution records live under `cell_results/`, raw TAU-2 result JSON
 lives under `memory_cells/`, and corpus identity / generated memory checks live
 under `memory_corpora/`.
 
 For a small E2E smoke, keep both the eval and train slices tiny:
 
 ```bash
-benchmark/tau2/run_full_eval.sh \
-  --config benchmark/tau2/config/baseline.yaml \
+benchmark/tau2/llm/run_full_eval.sh \
+  --config benchmark/tau2/llm/config/baseline.yaml \
   --domain retail \
   --strategy-id memory_v2_experience_only \
   --num-tasks 1 \
 
@@ -18,10 +18,10 @@ benchmark:
 paths:
   tau2_repo: ${TAU2_REPO:-data/external_benchmarks/tau2-bench}
   tau2_cli: ${TAU2_CLI:-tau2}
-  output_dir: benchmark/tau2/result
+  output_dir: benchmark/tau2/llm/result
   # Corpus writes are expensive and should be reused across eval run ids when
   # the train split and memory prompt/config did not change.
-  corpus_cache_dir: benchmark/tau2/result/memory_corpora
+  corpus_cache_dir: benchmark/tau2/llm/result/memory_corpora
 
 eval:
   # Default OpenViking TAU-2 memory evidence uses the fixed-first-user full8
 
@@ -24,7 +24,7 @@ strategies:
     retrieval_mode: first_user
     retrieval_top_k: 4
     first_user_inject_top_k: 4
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_traj_fixed_prewrite_only
     label: PR-B new trajectory fixed-count prewrite top2
@@ -40,7 +40,7 @@ strategies:
     retrieval_top_k: 4
     prewrite_retrieval_top_k: 2
     prewrite_inject_top_k: 2
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_traj_fixed_first_user_prewrite
     label: PR-B new trajectory fixed-count first-user top4 + prewrite top2
@@ -57,7 +57,7 @@ strategies:
     first_user_inject_top_k: 4
     prewrite_retrieval_top_k: 2
     prewrite_inject_top_k: 2
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_exp_fixed_first_user
     label: PR-B new experience fixed-count first-user top2
@@ -72,7 +72,7 @@ strategies:
     retrieval_mode: first_user
     retrieval_top_k: 2
     first_user_inject_top_k: 2
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_exp_fixed_prewrite_only
     label: PR-B new experience fixed-count prewrite top2
@@ -88,7 +88,7 @@ strategies:
     retrieval_top_k: 2
     prewrite_retrieval_top_k: 2
     prewrite_inject_top_k: 2
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_exp_fixed_first_user_prewrite
     label: PR-B new experience fixed-count first-user + prewrite top2
@@ -105,7 +105,7 @@ strategies:
     first_user_inject_top_k: 2
     prewrite_retrieval_top_k: 2
     prewrite_inject_top_k: 2
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_traj_4000_prewrite_only
     label: PR-B new trajectory 4000-char prewrite
@@ -122,7 +122,7 @@ strategies:
     prewrite_retrieval_top_k: 8
     prewrite_inject_top_k: 8
     memory_inject_max_chars: 4000
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: new_exp_4000_first_user_prewrite
     label: PR-B new experience 4000-char first-user + prewrite
@@ -140,4 +140,4 @@ strategies:
     prewrite_retrieval_top_k: 8
     prewrite_inject_top_k: 8
     memory_inject_max_chars: 4000
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
@@ -17,7 +17,7 @@ strategies:
   - id: no_memory_generic_scope
     label: TAU-2 no-memory same-seed baseline with generic memory scope prompt
     memory_backend: none
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md
 
   - id: trajectory_top4_first_user_prewrite_generic_scope
     label: Trajectory top4 first-user + pre-write top2 with generic memory scope prompt
@@ -31,4 +31,4 @@ strategies:
     first_user_inject_top_k: 4
     prewrite_retrieval_top_k: 2
     prewrite_inject_top_k: 2
-    scope_prompt_file: benchmark/tau2/config/scope_prompts/generic_memory_scope.md
+    scope_prompt_file: benchmark/tau2/llm/config/scope_prompts/generic_memory_scope.md