Add Agentic harness engineering concepts

Shashikant86 · Shashikant86 · commit 1036ac5d9469 · 2026-05-01T02:08:13.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.8] - 2026-05-01
+
+### Added
+- AHE-style layered trace evidence corpus export from `TraceStore`.
+- New `trace_analysis` action `export_evidence_corpus` for writing `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans.
+- Evidence corpus tests covering direct store export and environment action export.
+
 ## [0.1.7] - 2026-04-30
 
 ### Added
@@ -69,4 +76,5 @@ Initial public release of **RLM Code**.
 
 [0.1.5]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.5
 [0.1.6]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.6
 [0.1.7]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.7
+[0.1.8]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.8
diff --git a/README.md b/README.md
@@ -25,12 +25,13 @@ RLM Code implements the [Recursive Language Models](https://arxiv.org/abs/2502.0
 
 RLM Code wraps this algorithm in an interactive terminal UI with built-in benchmarks, trajectory replay, and observability.
 
-## Release v0.1.7
+## Release v0.1.8
 
-This release adds HALO-style trace analysis as a new RLM environment.
+This release extends HALO/AHE-style trace analysis with layered evidence export.
 
 - New `trace_analysis` environment for diagnosing agent harness failures from OTel-shaped JSONL traces
 - Sidecar trace indexing with dataset overview, query, count, search, full-trace view, and selected-span view actions
+- AHE-style evidence corpus export with `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans
 - Bounded payload handling for large traces, including oversized summaries and higher-cap surgical span reads
 - `/rlm` help/docs updated for `env=trace_analysis`
 - Dedicated trace analysis docs under the Core Engine section
diff --git a/docs/core/trace-analysis.md b/docs/core/trace-analysis.md
@@ -8,6 +8,11 @@ sidecar cache, exposes bounded trace-inspection actions to the RLM planner, and
 keeps large payloads under control by returning summaries or selected spans
 instead of blindly loading full traces into context.
 
+It can also export an AHE-style layered evidence corpus for downstream coding
+agents or `meta-harness`: a benchmark-level `overview.md`, one detail report per
+selected trace, an `index.json`, and optional processed raw JSONL span files for
+drill-down.
+
 ## Usage
 
 ```text
@@ -30,11 +35,43 @@ The environment supports these planner actions:
 | `view_trace` | Read all spans for a small trace, or return an oversized summary |
 | `search_trace` | Search one trace for a literal substring |
 | `view_spans` | Read selected spans at a higher per-attribute cap |
+| `export_evidence_corpus` | Write layered evidence files for downstream harness optimization |
 | `final` | Return the final evidence report |
 
 Supported filters are `has_errors`, `model_names`, `service_names`,
 `agent_names`, and `project_id`.
 
+## Evidence Corpus Export
+
+Use `export_evidence_corpus` when a report should be handed to another coding
+agent or to `meta-harness --trace-evidence`.
+
+Planner action shape:
+
+```json
+{
+  "action": "export_evidence_corpus",
+  "output_dir": "./trace-evidence",
+  "filters": {"has_errors": true},
+  "limit": 100,
+  "include_raw": true
+}
+```
+
+The output directory contains:
+
+- `overview.md`: compact entry point with dataset counts and links to detail files
+- `detail/<trace-id>.md`: per-trace summary, task ids, error spans, and tool-like spans
+- `raw/<trace-id>.jsonl`: processed raw spans for the selected trace, for drill-down; written only when `include_raw` is true
+- `index.json`: machine-readable corpus metadata and trace file references
+
+For MetaHarness, pass the generated overview directly:
+
+```bash
+uv run metaharness run ./my-harness \
+  --trace-evidence ./trace-evidence/overview.md
+```
+
 ## Trace Shape
 
 The first implementation expects one JSON object per line. Each line should
diff --git a/docs/index.md b/docs/index.md
@@ -6,7 +6,7 @@
 
 <p class="rlm-tagline">Research Playground & Evaluation OS for Recursive Language Model Agentic Systems</p>
 
-<span class="rlm-badge rlm-badge--purple">v0.1.7</span>
+<span class="rlm-badge rlm-badge--purple">v0.1.8</span>
 <span class="rlm-badge rlm-badge--green">Python 3.11+</span>
 <span class="rlm-badge rlm-badge--blue">Apache 2.0</span>
 
@@ -47,7 +47,7 @@ Run **Pure RLM** (paper-compliant with context-as-variable), **CodeAct** (contex
 <div class="rlm-feature-card" markdown>
 
 ### 🔎 Trace Analysis
-Run HALO-style trace diagnosis with `env=trace_analysis` over OTel-shaped JSONL traces to find repeated harness failure modes.
+Run HALO/AHE-style trace diagnosis with `env=trace_analysis` over OTel-shaped JSONL traces, then export layered evidence for MetaHarness.
 
 </div>
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "rlm-code"
-version = "0.1.7"
+version = "0.1.8"
 description = "RLM Code: Research Playground & Evaluation OS for Recursive Language Model Agentic Systems"
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/rlm_code/__init__.py b/rlm_code/__init__.py
@@ -5,5 +5,5 @@
 through natural language interactions.
 """
 
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 __author__ = "Super Agentic AI"
diff --git a/rlm_code/mcp/__init__.py b/rlm_code/mcp/__init__.py
@@ -17,7 +17,7 @@
 )
 from .session_wrapper import MCPSessionWrapper
 
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 
 __all__ = [
     "MCPClientManager",
diff --git a/rlm_code/rlm/environments.py b/rlm_code/rlm/environments.py
@@ -306,8 +306,10 @@ def system_prompt(self) -> str:
             "Return ONLY valid JSON object with keys:\n"
             "{"
             '"action": "set_trace_path" | "get_dataset_overview" | "query_traces" | '
-            '"count_traces" | "view_trace" | "search_trace" | "view_spans" | "final", '
+            '"count_traces" | "view_trace" | "search_trace" | "view_spans" | '
+            '"export_evidence_corpus" | "final", '
             '"trace_path": "<path to JSONL traces>", '
+            '"output_dir": "<directory for exported evidence corpus>", '
             '"filters": {"has_errors": true, "model_names": ["..."], "service_names": ["..."], '
             '"agent_names": ["..."], "project_id": "..."}, '
             '"trace_id": "<trace id>", '
@@ -324,6 +326,7 @@ def system_prompt(self) -> str:
             "- Always begin analysis with get_dataset_overview.\n"
             "- Use query_traces to choose real trace ids; never invent trace ids.\n"
             "- For large traces, prefer search_trace followed by view_spans.\n"
+            "- Use export_evidence_corpus when the caller needs files for MetaHarness or another coding agent.\n"
             "- Identify systemic harness failures, not one-off anomalies.\n"
             "- Output JSON only."
         )
@@ -448,6 +451,21 @@ def execute_action(
                     reward=0.7,
                     memory_note=f"Viewed selected spans for trace {trace_id}.",
                 )
+            if action_name == "export_evidence_corpus":
+                output_dir = self._required_str(action, "output_dir")
+                resolved_output = Path(output_dir).expanduser()
+                if not resolved_output.is_absolute():
+                    resolved_output = self.workdir / resolved_output
+                return self._ok(
+                    observation=store.export_evidence_corpus(
+                        resolved_output,
+                        filters,
+                        limit=self._int_arg(action, "limit", 100, minimum=1, maximum=1000),
+                        include_raw=self._bool_arg(action, "include_raw", True),
+                    ),
+                    reward=0.75,
+                    memory_note="Exported layered trace evidence corpus.",
+                )
         except Exception as exc:
             return EnvironmentActionResult(
                 observation={"success": False, "error": f"{type(exc).__name__}: {exc}"},
@@ -530,6 +548,19 @@ def _int_arg(
             parsed = default
         return max(minimum, min(maximum, parsed))
 
+    @staticmethod
+    def _bool_arg(action: dict[str, Any], key: str, default: bool) -> bool:
+        value = action.get(key, default)
+        if isinstance(value, bool):
+            return value
+        if isinstance(value, str):
+            normalized = value.strip().lower()
+            if normalized in {"1", "true", "yes", "on"}:
+                return True
+            if normalized in {"0", "false", "no", "off"}:
+                return False
+        return default
+
 
 class DSPyCodingRLMEnvironment(GenericRLMEnvironment):
     """DSPy-focused environment with file edit + tests + DSPy-aware scoring."""
diff --git a/rlm_code/traces/store.py b/rlm_code/traces/store.py
diff --git a/tests/test_trace_analysis.py b/tests/test_trace_analysis.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`)`
`18`	`18`	`from .session_wrapper import MCPSessionWrapper`
`19`	`19`
`20`		`-__version__ = "0.1.7"`
	`20`	`+__version__ = "0.1.8"`
`21`	`21`
`22`	`22`	`__all__ = [`
`23`	`23`	`"MCPClientManager",`