Improve pure RLM replay visibility

Shashikant86 · Shashikant86 · commit f7a693eeabdd · 2026-06-26T14:51:27.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -153,6 +153,7 @@ cython_debug/
 
 # Project specific
 dspy_config.yaml
+rlm_config.yaml
 *.log
 
 # Internal workspace data directories (all data in CWD)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,17 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.9] - 2026-06-26
+
+### Added
+- Pure RLM runner context initialization from explicit workspace file references in the task, with compact repository snapshot fallback.
+- Context-load events for Pure RLM runs, including loaded file names and total context characters.
+- Runner JSONL replay coverage for action code, observations, success state, token counts, and cumulative reward.
+
+### Changed
+- TUI trajectory and replay views now show Pure RLM signals including REPL code, stdout/stderr previews, `llm_query` counts, executed code blocks, finalization status, and REPL variables.
+- Run visualization now includes richer Pure RLM previews for completed runs.
+
 ## [0.1.8] - 2026-05-01
 
 ### Added
@@ -76,5 +87,6 @@ Initial public release of **RLM Code**.
 
 [0.1.5]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.5
 [0.1.6]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.6
+[0.1.9]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.9
 [0.1.8]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.8
 [0.1.7]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.7
diff --git a/README.md b/README.md
@@ -25,21 +25,20 @@ RLM Code implements the [Recursive Language Models](https://arxiv.org/abs/2502.0
 
 RLM Code wraps this algorithm in an interactive terminal UI with built-in benchmarks, trajectory replay, and observability.
 
-## Release v0.1.8
+## Release v0.1.9
 
-This release extends HALO/AHE-style trace analysis with layered evidence export.
+This release improves Pure RLM repository runs and makes completed trajectories more inspectable from the TUI and replay views.
 
-- New `trace_analysis` environment for diagnosing agent harness failures from OTel-shaped JSONL traces
-- Sidecar trace indexing with dataset overview, query, count, search, full-trace view, and selected-span view actions
-- AHE-style evidence corpus export with `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans
-- Bounded payload handling for large traces, including oversized summaries and higher-cap surgical span reads
-- `/rlm` help/docs updated for `env=trace_analysis`
-- Dedicated trace analysis docs under the Core Engine section
+- Pure RLM runs now initialize `context` from explicit workspace files mentioned in the task, with a compact repository snapshot fallback
+- Runner events now record context-load metadata for Pure RLM runs
+- Legacy runner JSONL step events replay with action code, observations, success, token counts, and cumulative reward
+- Run visualization now includes REPL code previews, stdout/stderr previews, `llm_query` counts, executed code blocks, finalization status, and REPL variables
+- TUI trajectory and replay views now surface Pure RLM signals directly for completed runs
 
 Example:
 
 ```text
-/rlm run "Find systemic harness failures trace=./traces.jsonl" env=trace_analysis steps=6
+/rlm run "Validate pure_rlm_environment.py and cite context, REPL, llm_query, and FINAL evidence" env=pure_rlm steps=6
 ```
 
 ## Documentation
diff --git a/docs/index.md b/docs/index.md
@@ -6,7 +6,7 @@
 
 <p class="rlm-tagline">Research Playground & Evaluation OS for Recursive Language Model Agentic Systems</p>
 
-<span class="rlm-badge rlm-badge--purple">v0.1.8</span>
+<span class="rlm-badge rlm-badge--purple">v0.1.9</span>
 <span class="rlm-badge rlm-badge--green">Python 3.11+</span>
 <span class="rlm-badge rlm-badge--blue">Apache 2.0</span>
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "rlm-code"
-version = "0.1.8"
+version = "0.1.9"
 description = "RLM Code: Research Playground & Evaluation OS for Recursive Language Model Agentic Systems"
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/rlm_code/__init__.py b/rlm_code/__init__.py
@@ -5,5 +5,5 @@
 through natural language interactions.
 """
 
-__version__ = "0.1.8"
+__version__ = "0.1.9"
 __author__ = "Super Agentic AI"
diff --git a/rlm_code/mcp/__init__.py b/rlm_code/mcp/__init__.py
@@ -17,7 +17,7 @@
 )
 from .session_wrapper import MCPSessionWrapper
 
-__version__ = "0.1.8"
+__version__ = "0.1.9"
 
 __all__ = [
     "MCPClientManager",
diff --git a/rlm_code/rlm/runner.py b/rlm_code/rlm/runner.py
@@ -9,6 +9,7 @@
 
 import hashlib
 import json
+import re
 import threading
 import time
 from dataclasses import asdict, dataclass, is_dataclass
@@ -29,7 +30,7 @@
 )
 from .benchmarks import RLMBenchmarkCase, load_benchmark_packs
 from .chat_session import ChatSessionMixin
-from .context_store import LazyFileContext
+from .context_store import ContextRef, LazyFileContext
 from .delegation import DelegationMixin
 from .environments import (
     DSPyCodingRLMEnvironment,
@@ -467,6 +468,93 @@ def _build_pure_rlm_environment(self, workdir: Path | None = None) -> PureRLMEnv
             allow_unsafe_exec=(selected_backend == "exec" and self._pure_rlm_allow_unsafe_exec),
         )
 
+    def _extract_task_file_refs(self, task: str, limit: int = 12) -> list[ContextRef]:
+        """
+        Find explicit workspace file references mentioned in a task string.
+
+        Scans `task` for path-like tokens (optional `dir/` segments followed by
+        a file name) that end in one of the recognised extensions, and wraps
+        each distinct match in a `ContextRef`, preserving first-seen order.
+
+        Args:
+            task: Free-form task text to scan.
+            limit: Maximum number of references to return; values <= 0 yield
+                an empty list.
+
+        Returns:
+            Up to `limit` unique `ContextRef` objects, one per referenced path.
+        """
+        if limit <= 0:
+            return []
+        # The trailing `(?!\w)` forces the extension to match whole.  Without
+        # it the ordered alternation truncates `app.jsx` to `app.js`,
+        # `view.tsx` to `view.ts`, and `data.jsonl` to `data.json`.
+        candidates = re.findall(
+            r"(?<![\w.-])(?:[\w.-]+/)*[\w.-]+\.(?:py|md|toml|yaml|yml|json|txt|js|jsx|ts|tsx)(?!\w)",
+            task,
+        )
+        seen: set[str] = set()
+        refs: list[ContextRef] = []
+        for candidate in candidates:
+            # A match consists only of word characters, '.', '-' and '/', and
+            # always ends in the extension, so there is no punctuation to trim.
+            # Stripping '.' from both ends would turn `./pkg/mod.py` into
+            # `/pkg/mod.py` and `.github/x.yml` into `github/x.yml`.
+            if candidate in seen:
+                continue
+            seen.add(candidate)
+            refs.append(ContextRef(path=candidate))
+            if len(refs) >= limit:
+                break
+        return refs
+
+    def _build_pure_rlm_initial_context(self, task: str) -> dict[str, str]:
+        """
+        Assemble the initial `context` mapping for a Pure RLM run.
+
+        File references named in `task` are preferred; when the task names
+        none, up to 12 references from `context_store.discover` are used
+        instead.  Each reference is read with a 12000-character cap and kept
+        under its path when the read returns non-empty text.
+
+        If nothing could be loaded, the result is a single `_workspace` entry
+        holding the working directory and a listing of up to 80 discovered
+        paths, so the run still starts with some workspace information.
+        """
+        store = self.context_store
+        selected = self._extract_task_file_refs(task) or store.discover(limit=12)
+
+        loaded: dict[str, str] = {
+            ref.path: text
+            for ref in selected
+            if (text := store.read(ref, max_chars=12000))
+        }
+        if loaded:
+            return loaded
+
+        # Nothing readable: fall back to a plain listing of workspace paths.
+        listing = "\n".join(item.path for item in store.discover(limit=80))
+        summary = (
+            f"Workspace: {self.workdir}\n"
+            "No explicit file snippets were loaded. Available files:\n"
+            f"{listing}"
+        )
+        return {"_workspace": summary.strip()}
+
+    def _initialize_pure_rlm_run_context(
+        self,
+        env: RLMEnvironment,
+        task: str,
+        *,
+        run_id: str,
+        run_path: Path,
+    ) -> int:
+        """
+        Seed `context` on a Pure RLM environment and record that it happened.
+
+        Does nothing and returns 0 unless `env.name` is `pure_rlm` and the
+        environment exposes `initialize_context`.  Otherwise the context built
+        from `task` is handed to the environment (with the task also exposed
+        as the `query` variable), a `context` event is appended to `run_path`,
+        and a `context_load` runtime event is emitted.
+
+        Returns:
+            The number of entries in the loaded context.
+        """
+        supports_context = env.name == "pure_rlm" and hasattr(env, "initialize_context")
+        if not supports_context:
+            return 0
+
+        seeded = self._build_pure_rlm_initial_context(task)
+        env.initialize_context(
+            seeded,
+            description="Workspace files selected for this Pure RLM run",
+            additional_vars={"query": task},
+        )
+
+        file_count = len(seeded)
+        total_chars = sum(map(len, seeded.values()))
+
+        # Persisted record: lists the loaded keys and their combined size.
+        self._append_event(
+            run_path,
+            {
+                "type": "context",
+                "run_id": run_id,
+                "environment": env.name,
+                "timestamp": self._utc_now(),
+                "context_files": list(seeded),
+                "context_chars": total_chars,
+            },
+        )
+        # Live notification: carries only the counts, not the file names.
+        self._emit_runtime_event(
+            "context_load",
+            {"run_id": run_id, "files": file_count, "chars": total_chars},
+        )
+        return file_count
+
     def run_task(
         self,
         task: str,
@@ -596,6 +684,12 @@ def run_task(
         final_response = ""
         cancelled = False
         trajectory: list[dict[str, Any]] = []
+        context_files = self._initialize_pure_rlm_run_context(
+            env,
+            cleaned_task,
+            run_id=run_id,
+            run_path=run_path,
+        )
         usage_start = self._usage_snapshot()
         self.observability.on_run_start(
             run_id,
@@ -616,6 +710,7 @@ def run_task(
                 "parent_run_id": _parent_run_id,
                 "pure_rlm_backend": self._pure_rlm_backend if env.name == "pure_rlm" else None,
                 "pure_rlm_strict": strict_pure_mode if env.name == "pure_rlm" else None,
+                "context_files": context_files if env.name == "pure_rlm" else None,
             },
         )
         self._emit_runtime_event(
@@ -627,6 +722,7 @@ def run_task(
                 "framework": native_framework,
                 "depth": _depth,
                 "parent_run_id": _parent_run_id,
+                "context_files": context_files if env.name == "pure_rlm" else None,
             },
         )
 
diff --git a/rlm_code/rlm/session_replay.py b/rlm_code/rlm/session_replay.py
@@ -1035,14 +1035,30 @@ def _convert_legacy_step(data: dict[str, Any]) -> SessionEvent:
     step_type = data.get("type", "")
 
     if step_type == "step":
+        observation = data.get("observation", {})
+        observation_dict = observation if isinstance(observation, dict) else {}
+        action = data.get("action", {})
+        action_dict = action if isinstance(action, dict) else {}
+        success = observation_dict.get("success")
+        if success is None:
+            success = not bool(observation_dict.get("error") or observation_dict.get("stderr"))
+        usage = data.get("usage", {})
+        usage_dict = usage if isinstance(usage, dict) else {}
         return SessionEvent(
             event_type=SessionEventType.STEP_END,
             timestamp=data.get("timestamp", _utc_now()),
-            step=data.get("step", 0),
+            step=int(data.get("step", 0) or 0),
             data={
-                "action": data.get("action", {}),
-                "observation": data.get("observation", {}),
+                "step": int(data.get("step", 0) or 0),
+                "timestamp": data.get("timestamp", _utc_now()),
+                "action": action_dict,
+                "observation": observation_dict,
                 "reward": data.get("reward", 0.0),
+                "success": bool(success),
+                "tokens_used": int(
+                    usage_dict.get("prompt_tokens", 0) or 0
+                )
+                + int(usage_dict.get("completion_tokens", 0) or 0),
             },
             run_id=data.get("run_id", ""),
             depth=data.get("depth", 0),
@@ -1125,25 +1141,35 @@ def _build_snapshot_from_events(
 
         elif event.event_type == SessionEventType.STEP_END:
             # Build StepState from accumulated data
+            if "step" not in current_step_data:
+                current_step_data = {
+                    "step": int(event.data.get("step", event.step) or 0),
+                    "timestamp": str(event.data.get("timestamp", event.timestamp) or ""),
+                }
             if "step" in current_step_data:
                 # Merge any additional data from STEP_END event
                 if "action" in event.data:
                     action = event.data["action"]
                     current_step_data.setdefault("action_type", action.get("action", ""))
                     current_step_data.setdefault("action_code", action.get("code", ""))
+                    current_step_data.setdefault("action_rationale", action.get("reasoning", ""))
                     current_step_data.setdefault("raw_action", action)
                 if "observation" in event.data:
                     obs = event.data["observation"]
                     current_step_data.setdefault("output", obs.get("output", obs.get("stdout", "")))
                     current_step_data.setdefault("error", obs.get("error", obs.get("stderr", "")))
                     current_step_data.setdefault("raw_observation", obs)
                 if "reward" in event.data:
+                    reward = float(event.data.get("reward", 0.0) or 0.0)
+                    cumulative = event.data.get("cumulative_reward")
+                    if cumulative is None:
+                        cumulative = total_reward + reward
                     current_step_data.setdefault("reward", event.data["reward"])
-                    current_step_data.setdefault(
-                        "cumulative_reward", event.data.get("cumulative_reward", 0.0)
-                    )
+                    current_step_data.setdefault("cumulative_reward", cumulative)
                 if "success" in event.data:
                     current_step_data.setdefault("success", event.data["success"])
+                if "tokens_used" in event.data:
+                    current_step_data.setdefault("tokens_used", event.data["tokens_used"])
 
                 step_state = StepState(
                     step=current_step_data.get("step", 0),
@@ -1163,6 +1189,8 @@ def _build_snapshot_from_events(
                     raw_observation=current_step_data.get("raw_observation", {}),
                 )
                 steps.append(step_state)
+                total_reward = float(step_state.cumulative_reward)
+                total_tokens += int(step_state.tokens_used or 0)
                 current_step_data = {}
 
         elif event.event_type == SessionEventType.MEMORY_UPDATE:
diff --git a/rlm_code/rlm/visualizer.py b/rlm_code/rlm/visualizer.py
@@ -62,6 +62,16 @@ def build_run_visualization(
             "success": observation_dict.get("success") if "success" in observation_dict else None,
             "path": str(observation_dict.get("path") or ""),
             "children_executed": int(observation_dict.get("children_executed") or 0),
+            "planner_preview": _clip_text(str(step.get("planner_raw") or ""), limit=260),
+            "code_preview": _clip_text(_action_code(step), limit=260),
+            "stdout_preview": _clip_text(str(observation_dict.get("stdout") or ""), limit=260),
+            "stderr_preview": _clip_text(str(observation_dict.get("stderr") or ""), limit=180),
+            "llm_calls_made": int(observation_dict.get("llm_calls_made") or 0),
+            "code_blocks_executed": int(observation_dict.get("code_blocks_executed") or 0),
+            "final_detected": bool(observation_dict.get("final_detected", False)),
+            "repl_variables": list(observation_dict.get("repl_variables") or [])[:20]
+            if isinstance(observation_dict.get("repl_variables"), list)
+            else [],
         }
         error = _extract_error(step)
         if error:
@@ -190,6 +200,19 @@ def _action_name(step: dict[str, Any]) -> str:
     return "unknown"
 
 
+def _action_code(step: dict[str, Any]) -> str:
+    """Return the code carried by a step's action, or "" when there is none.
+
+    A non-blank string under `action["code"]` is returned unchanged.  Failing
+    that, a list under `action["_code_blocks"]` is rendered by converting each
+    entry to text, dropping blank entries, and joining the rest with a blank
+    line between them.  Any other shape yields the empty string.
+    """
+    action = step.get("action")
+    if isinstance(action, dict):
+        primary = action.get("code")
+        if isinstance(primary, str) and primary.strip():
+            return primary
+        blocks = action.get("_code_blocks")
+        if isinstance(blocks, list):
+            rendered = [str(block) for block in blocks]
+            return "\n\n".join(text for text in rendered if text.strip())
+    return ""
+
+
 def _extract_error(step: dict[str, Any]) -> str:
     observation = step.get("observation")
     if not isinstance(observation, dict):
diff --git a/rlm_code/ui/tui_app.py b/rlm_code/ui/tui_app.py
diff --git a/tests/rlm/test_session_replay.py b/tests/rlm/test_session_replay.py
diff --git a/tests/test_rlm_runner.py b/tests/test_rlm_runner.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`)`
`18`	`18`	`from .session_wrapper import MCPSessionWrapper`
`19`	`19`
`20`		`-__version__ = "0.1.8"`
	`20`	`+__version__ = "0.1.9"`
`21`	`21`
`22`	`22`	`__all__ = [`
`23`	`23`	`"MCPClientManager",`