
Commit dcb1bac

sjarmak and claude committed

Update IR metrics after openlibrary baseline promotion

- Run normalize_retrieval_events + retrieval_eval_pipeline on all official runs
- New baseline has 10 local retrieval events (8 file_read, 2 file_write); 1/11 ground truth files hit (file_recall=0.091)
- Aggregate IR: 618 computable tasks (was 594), 1084 event files (was 1005)
- Update technical report IR tables with exact pipeline outputs
- Fix slice distributions: file reads 376 (was 364), code search 125 (was 115), MCP retrieval 243/618 (was 229/594)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

1 parent d7ab5ec commit dcb1bac

File tree

5 files changed: +733 −136 lines


docs/technical_reports/TECHNICAL_REPORT_V1.md

Lines changed: 16 additions & 16 deletions
@@ -943,19 +943,19 @@ The strongest pattern in this paired slice is by `context_length`: the larger pr
 
 ### 11.6 Information Retrieval Metrics
 
-The IR evaluation pipeline (Section 8) produces file-level recall, MRR, MAP, nDCG, context efficiency, and utilization probes for tasks with ground truth file sets. Results from the full pipeline run (n=594 computable tasks out of 1,005 event files):
+The IR evaluation pipeline (Section 8) produces file-level recall, MRR, MAP, nDCG, context efficiency, and utilization probes for tasks with ground truth file sets. Results from the full pipeline run (n=618 computable tasks out of 1,084 event files):
 
 **Aggregate File-Level IR Metrics:**
 
 | Metric | Mean | Median | Std | n |
 |--------|------|--------|-----|---|
-| File Recall | 0.375 | 0.111 | 0.424 | 594 |
-| MRR | 0.347 | 0.007 | 0.443 | 594 |
-| MAP | 0.232 | 0.008 | 0.340 | 594 |
-| Context Efficiency | 0.190 | 0.013 | 0.280 | 594 |
-| Precision@1 | 0.298 | 0.000 | 0.458 | 594 |
-| Recall@5 | 0.223 | 0.000 | 0.345 | 594 |
-| nDCG@10 | 0.275 | 0.000 | 0.371 | 594 |
+| File Recall | 0.374 | 0.125 | 0.421 | 618 |
+| MRR | 0.351 | 0.009 | 0.444 | 618 |
+| MAP | 0.230 | 0.010 | 0.337 | 618 |
+| Context Efficiency | 0.192 | 0.019 | 0.279 | 618 |
+| Precision@1 | 0.301 | 0.000 | 0.459 | 618 |
+| Recall@5 | 0.220 | 0.000 | 0.342 | 618 |
+| nDCG@10 | 0.274 | 0.000 | 0.369 | 618 |
 
 **High-Confidence Subset** (medium/high-confidence ground truth, n=26):
@@ -967,25 +967,25 @@ The IR evaluation pipeline (Section 8) produces file-level recall, MRR, MAP, nDC
 | Context Efficiency | 0.432 | 0.287 |
 | TTFR | 24.9s | 11.1s |
 
-**Utilization Probes** (n=594):
+**Utilization Probes** (n=618):
 
 | Probe | Mean | Median |
 |-------|------|--------|
-| Read Overlap with Relevant Files | 0.337 | 0.093 |
-| Write Overlap with Relevant Files | 0.056 | 0.000 |
-| Read-Before-Write Ratio | 0.195 | 0.000 |
+| Read Overlap with Relevant Files | 0.335 | 0.111 |
+| Write Overlap with Relevant Files | 0.055 | 0.000 |
+| Read-Before-Write Ratio | 0.194 | 0.000 |
 
-**Error Taxonomy** (n=594):
+**Error Taxonomy** (n=618):
 
 | Error Type | Mean Count | Median |
 |------------|-----------|--------|
-| Irrelevant Retrieval | 39.7 | 7.0 |
+| Irrelevant Retrieval | 39.1 | 8.0 |
 | Missed Key Evidence | 5.8 | 3.0 |
 | Wrong Evidence Used | 2.2 | 1.0 |
 | Unused Correct Retrieval | 2.2 | 0.0 |
-| Ambiguity Near Miss | 17.2 | 0.0 |
+| Ambiguity Near Miss | 16.9 | 0.0 |
 
-**Retrieval-Outcome Correlation:** Spearman rho = 0.078 (p=0.737, n=26 high-confidence tasks), indicating negligible correlation between retrieval quality (MRR) and task outcome (reward) in the current sample. The wide median-mean gaps across all IR metrics reflect a bimodal distribution: agents either find the right files early (high MRR) or miss them entirely (MRR=0). The dominant retrieval strategy is file reads (364 tasks), followed by code search (115 tasks), with MCP-based retrieval accounting for 229 of 594 evidence traces.
+**Retrieval-Outcome Correlation:** Spearman rho = 0.078 (p=0.737, n=26 high-confidence tasks), indicating negligible correlation between retrieval quality (MRR) and task outcome (reward) in the current sample. The wide median-mean gaps across all IR metrics reflect a bimodal distribution: agents either find the right files early (high MRR) or miss them entirely (MRR=0). The dominant retrieval strategy is file reads (376 tasks), followed by code search (125 tasks), with MCP-based retrieval accounting for 243 of 618 evidence traces.
 
 ### 11.7 MCP Tool Usage Patterns
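The file-level metrics updated in the tables above can be illustrated with a minimal sketch. This is not the report's actual `retrieval_eval_pipeline`; the function names and the example data below are hypothetical, chosen to mirror the new baseline trace in this commit (11 ground-truth files, one hit on the third read, so file_recall = 1/11 ≈ 0.091).

```python
# Illustrative reimplementation of three of the IR metrics named above.
# Not the report's pipeline; function names are hypothetical.

def file_recall(retrieved: list[str], relevant: set[str]) -> float:
    """Fraction of ground-truth files touched at least once."""
    return len(set(retrieved) & relevant) / len(relevant) if relevant else 0.0

def mrr(retrieved: list[str], relevant: set[str]) -> float:
    """Reciprocal rank of the first relevant file in retrieval order."""
    for rank, path in enumerate(retrieved, start=1):
        if path in relevant:
            return 1.0 / rank
    return 0.0

def precision_at_k(retrieved: list[str], relevant: set[str], k: int) -> float:
    """Share of the first k retrieved files that are relevant."""
    top = retrieved[:k]
    return sum(1 for path in top if path in relevant) / k if top else 0.0

# Hypothetical trace shaped like the new baseline: 11 ground-truth files,
# only bookshelves.py found, at rank 3.
relevant = {f"gt_file_{i}.py" for i in range(10)} | {"openlibrary/core/bookshelves.py"}
retrieved = [
    "tests/test_docker_compose.py",
    "compose.yaml",
    "openlibrary/core/bookshelves.py",
]
print(round(file_recall(retrieved, relevant), 3))  # 0.091
print(round(mrr(retrieved, relevant), 3))          # 0.333
```

A bimodal MRR distribution (median near 0, mean near 0.35) falls out of this definition directly: a task with any early hit scores close to 1, and a task with no hit scores exactly 0.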
Lines changed: 266 additions & 0 deletions
@@ -0,0 +1,266 @@
+{
+  "schema_version": "1.0",
+  "generated_at": "2026-02-27T17:41:30.859784+00:00",
+  "provenance": {
+    "run_id": "ccb_fix_haiku_20260227_151833",
+    "batch_timestamp": "2026-02-27__15-18-42",
+    "task_name": "openlibrary-solr-boolean-fix-001",
+    "trial_name": "openlibrary-solr-boolean-fix-001__QG7rmi9",
+    "config_name": "baseline-local-direct",
+    "benchmark": "unknown",
+    "model": "claude-haiku-4-5-20251001"
+  },
+  "coverage": {
+    "has_trajectory": true,
+    "has_transcript": true,
+    "has_ground_truth": true,
+    "has_chunk_ground_truth": false,
+    "trace_source": "merged",
+    "degraded_reason": null,
+    "ground_truth_source": "patch",
+    "ground_truth_confidence": "high"
+  },
+  "ground_truth": {
+    "files": [
+      "docker-compose.yml",
+      "openlibrary/core/bookshelves.py",
+      "openlibrary/core/models.py",
+      "openlibrary/core/ratings.py",
+      "openlibrary/macros/SearchResultsWork.html",
+      "openlibrary/macros/StarRatings.html",
+      "openlibrary/plugins/upstream/mybooks.py",
+      "openlibrary/plugins/worksearch/code.py",
+      "openlibrary/templates/account/books.html",
+      "openlibrary/templates/account/reading_log.html",
+      "openlibrary/utils/solr.py"
+    ],
+    "expected_edit_files": [
+      "docker-compose.yml",
+      "openlibrary/core/bookshelves.py",
+      "openlibrary/core/models.py",
+      "openlibrary/core/ratings.py",
+      "openlibrary/macros/SearchResultsWork.html",
+      "openlibrary/macros/StarRatings.html",
+      "openlibrary/plugins/upstream/mybooks.py",
+      "openlibrary/plugins/worksearch/code.py",
+      "openlibrary/templates/account/books.html",
+      "openlibrary/templates/account/reading_log.html",
+      "openlibrary/utils/solr.py"
+    ],
+    "expected_edit_files_source": "patch",
+    "expected_edit_files_confidence": "high"
+  },
+  "events": [
+    {
+      "step_index": 10,
+      "timestamp": "2026-02-27T15:38:49.757Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/tests/test_docker_compose.py"
+      },
+      "target_files": [
+        "tests/test_docker_compose.py"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 48191,
+      "elapsed_seconds": 74.6
+    },
+    {
+      "step_index": 12,
+      "timestamp": "2026-02-27T15:38:55.048Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/compose.yaml"
+      },
+      "target_files": [
+        "compose.yaml"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 51836,
+      "elapsed_seconds": 79.9
+    },
+    {
+      "step_index": 13,
+      "timestamp": "2026-02-27T15:38:58.085Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/openlibrary/core/bookshelves.py"
+      },
+      "target_files": [
+        "openlibrary/core/bookshelves.py"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": true,
+      "matched_ground_truth_files": [
+        "openlibrary/core/bookshelves.py"
+      ],
+      "is_subagent": false,
+      "cumulative_tokens": 52551,
+      "elapsed_seconds": 82.9
+    },
+    {
+      "step_index": 22,
+      "timestamp": "2026-02-27T15:41:29.806Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/pyproject.toml"
+      },
+      "target_files": [
+        "pyproject.toml"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 87628,
+      "elapsed_seconds": 234.6
+    },
+    {
+      "step_index": 26,
+      "timestamp": "2026-02-27T15:41:51.029Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/scripts/solr_builder/compose.yaml"
+      },
+      "target_files": [
+        "scripts/solr_builder/compose.yaml"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 99767,
+      "elapsed_seconds": 255.9
+    },
+    {
+      "step_index": 28,
+      "timestamp": "2026-02-27T15:42:07.263Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/scripts/solr_builder/compose.yaml"
+      },
+      "target_files": [
+        "scripts/solr_builder/compose.yaml"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 107703,
+      "elapsed_seconds": 272.1
+    },
+    {
+      "step_index": 29,
+      "timestamp": "2026-02-27T15:42:10.637Z",
+      "tool_name": "Edit",
+      "tool_category": "file_write",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/scripts/solr_builder/compose.yaml",
+        "old_string": "      - SOLR_OPTS=\n        -Dsolr.autoCommit.maxTime=3600000\n        -Dsolr.environment=test",
+        "new_string": "      - SOLR_OPTS=\n        -Dsolr.autoCommit.maxTime=3600000\n        -Dsolr.max.booleanClauses=30000\n        -Dsolr.environment=test",
+        "replace_all": false
+      },
+      "target_files": [
+        "scripts/solr_builder/compose.yaml"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 112164,
+      "elapsed_seconds": 275.5
+    },
+    {
+      "step_index": 36,
+      "timestamp": "2026-02-27T15:43:06.537Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/setup.py"
+      },
+      "target_files": [
+        "setup.py"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 120607,
+      "elapsed_seconds": 331.4
+    },
+    {
+      "step_index": 43,
+      "timestamp": "2026-02-27T15:43:55.978Z",
+      "tool_name": "Read",
+      "tool_category": "file_read",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/tests/test_docker_compose.py"
+      },
+      "target_files": [
+        "tests/test_docker_compose.py"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 131329,
+      "elapsed_seconds": 380.8
+    },
+    {
+      "step_index": 44,
+      "timestamp": "2026-02-27T15:44:01.767Z",
+      "tool_name": "Edit",
+      "tool_category": "file_write",
+      "is_mcp": false,
+      "arguments": {
+        "file_path": "/workspace/tests/test_docker_compose.py",
+        "old_string": "    def test_shared_constants(self):\n        # read the value in compose.yaml\n        with open(p(\"..\", \"compose.yaml\")) as f:\n            prod_dc: dict = yaml.safe_load(f)\n        solr_service = prod_dc[\"services\"][\"solr\"]\n        solr_opts = next(var.split(\"=\", 1)[1] for var in solr_service[\"environment\"] if var.startswith(\"SOLR_OPTS=\"))\n        solr_opts_max_boolean_clauses = next(int(opt.split(\"=\", 1)[1]) for opt in solr_opts.split() if opt.startswith(\"-Dsolr.max.booleanClauses\"))\n\n        # read the value in openlibrary/core/bookshelves.py\n        from openlibrary.core.bookshelves import FILTER_BOOK_LIMIT\n\n        assert solr_opts_max_boolean_clauses >= FILTER_BOOK_LIMIT",
+        "new_string": "    def test_shared_constants(self):\n        import re\n\n        # read the value in compose.yaml\n        with open(p(\"..\", \"compose.yaml\")) as f:\n            prod_dc: dict = yaml.safe_load(f)\n        solr_service = prod_dc[\"services\"][\"solr\"]\n        solr_opts = next(var.split(\"=\", 1)[1] for var in solr_service[\"environment\"] if var.startswith(\"SOLR_OPTS=\"))\n        solr_opts_max_boolean_clauses = next(int(opt.split(\"=\", 1)[1]) for opt in solr_opts.split() if opt.startswith(\"-Dsolr.max.booleanClauses\"))\n\n        # read the value in openlibrary/core/bookshelves.py using regex to avoid Python 3.12+ syntax issues\n        with open(p(\"..\", \"openlibrary/core/bookshelves.py\")) as f:\n            content = f.read()\n        match = re.search(r'FILTER_BOOK_LIMIT:\\s*Final\\s*=\\s*(\\d+(?:_\\d+)*)', content)\n        assert match, \"FILTER_BOOK_LIMIT not found in bookshelves.py\"\n        filter_book_limit = int(match.group(1).replace('_', ''))\n\n        assert solr_opts_max_boolean_clauses >= filter_book_limit",
+        "replace_all": false
+      },
+      "target_files": [
+        "tests/test_docker_compose.py"
+      ],
+      "target_symbols": [],
+      "hits_ground_truth": false,
+      "matched_ground_truth_files": [],
+      "is_subagent": false,
+      "cumulative_tokens": 134314,
+      "elapsed_seconds": 386.6
+    }
+  ],
+  "summary": {
+    "total_events": 10,
+    "mcp_events": 0,
+    "local_events": 10,
+    "unique_files_accessed": 6,
+    "ground_truth_files_hit": 1,
+    "ground_truth_files_total": 11,
+    "first_ground_truth_hit_step": 13,
+    "events_by_category": {
+      "file_read": 8,
+      "file_write": 2
+    }
+  }
+}
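The `summary` block in the events file above is derivable from its `events` and `ground_truth` sections. A minimal sketch of that derivation follows; the `summarize` function is hypothetical and is not part of `normalize_retrieval_events`, but it reproduces the fields shown (e.g. 1 of 11 ground-truth files hit, first hit at step 13 for this trace).

```python
from collections import Counter

def summarize(doc: dict) -> dict:
    """Recompute the summary block of a retrieval-events document
    (hypothetical helper, shaped after the JSON file above)."""
    events = doc["events"]
    gt_files = set(doc["ground_truth"]["files"])

    hit_files: set[str] = set()
    first_hit_step = None
    for ev in events:
        if ev["hits_ground_truth"]:
            hit_files.update(ev["matched_ground_truth_files"])
            if first_hit_step is None:
                first_hit_step = ev["step_index"]

    return {
        "total_events": len(events),
        "mcp_events": sum(1 for ev in events if ev["is_mcp"]),
        "local_events": sum(1 for ev in events if not ev["is_mcp"]),
        "unique_files_accessed": len({f for ev in events for f in ev["target_files"]}),
        "ground_truth_files_hit": len(hit_files),
        "ground_truth_files_total": len(gt_files),
        "first_ground_truth_hit_step": first_hit_step,
        "events_by_category": dict(Counter(ev["tool_category"] for ev in events)),
    }
```

Note that `ground_truth_files_hit` counts distinct matched files, not hit events, which is why eight reads of mostly non-ground-truth files still yield a count of 1 here.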
