Skip to content

Commit f7a693e

Browse files
committed
Improve pure RLM replay visibility
1 parent 1036ac5 commit f7a693e

14 files changed

Lines changed: 356 additions & 27 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ cython_debug/
153153

154154
# Project specific
155155
dspy_config.yaml
156+
rlm_config.yaml
156157
*.log
157158

158159
# Internal workspace data directories (all data in CWD)

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,17 @@ All notable changes to this project are documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.1.9] - 2026-06-26
9+
10+
### Added
11+
- Pure RLM runner context initialization from explicit workspace file references in the task, with a compact repository snapshot fallback.
12+
- Context-load events for Pure RLM runs, including loaded file names and total context characters.
13+
- Runner JSONL replay coverage for action code, observations, success state, token counts, and cumulative reward.
14+
15+
### Changed
16+
- TUI trajectory and replay views now show Pure RLM signals including REPL code, stdout/stderr previews, `llm_query` counts, executed code blocks, finalization status, and REPL variables.
17+
- Run visualization now includes richer Pure RLM previews for completed runs.
18+
819
## [0.1.8] - 2026-05-01
920

1021
### Added
@@ -76,5 +87,6 @@ Initial public release of **RLM Code**.
7687

7788
[0.1.5]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.5
7889
[0.1.6]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.6
90+
[0.1.9]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.9
7991
[0.1.8]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.8
8092
[0.1.7]: https://github.com/SuperagenticAI/rlm-code/releases/tag/v0.1.7

README.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,20 @@ RLM Code implements the [Recursive Language Models](https://arxiv.org/abs/2502.0
2525

2626
RLM Code wraps this algorithm in an interactive terminal UI with built-in benchmarks, trajectory replay, and observability.
2727

28-
## Release v0.1.8
28+
## Release v0.1.9
2929

30-
This release extends HALO/AHE-style trace analysis with layered evidence export.
30+
This release improves Pure RLM repository runs and makes completed trajectories more inspectable from the TUI and replay views.
3131

32-
- New `trace_analysis` environment for diagnosing agent harness failures from OTel-shaped JSONL traces
33-
- Sidecar trace indexing with dataset overview, query, count, search, full-trace view, and selected-span view actions
34-
- AHE-style evidence corpus export with `overview.md`, per-trace detail reports, `index.json`, and optional processed raw JSONL spans
35-
- Bounded payload handling for large traces, including oversized summaries and higher-cap surgical span reads
36-
- `/rlm` help/docs updated for `env=trace_analysis`
37-
- Dedicated trace analysis docs under the Core Engine section
32+
- Pure RLM runs now initialize `context` from explicit workspace files mentioned in the task, with a compact repository snapshot fallback
33+
- Runner events now record context-load metadata for Pure RLM runs
34+
- Legacy runner JSONL step events now replay with action code, observations, success, token counts, and cumulative reward
35+
- Run visualization now includes REPL code previews, stdout/stderr previews, `llm_query` counts, executed code blocks, finalization status, and REPL variables
36+
- TUI trajectory and replay views now surface Pure RLM signals directly for completed runs
3837

3938
Example:
4039

4140
```text
42-
/rlm run "Find systemic harness failures trace=./traces.jsonl" env=trace_analysis steps=6
41+
/rlm run "Validate pure_rlm_environment.py and cite context, REPL, llm_query, and FINAL evidence" env=pure_rlm steps=6
4342
```
4443

4544
## Documentation

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<p class="rlm-tagline">Research Playground & Evaluation OS for Recursive Language Model Agentic Systems</p>
88

9-
<span class="rlm-badge rlm-badge--purple">v0.1.8</span>
9+
<span class="rlm-badge rlm-badge--purple">v0.1.9</span>
1010
<span class="rlm-badge rlm-badge--green">Python 3.11+</span>
1111
<span class="rlm-badge rlm-badge--blue">Apache 2.0</span>
1212

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "rlm-code"
7-
version = "0.1.8"
7+
version = "0.1.9"
88
description = "RLM Code: Research Playground & Evaluation OS for Recursive Language Model Agentic Systems"
99
readme = "README.md"
1010
license = "Apache-2.0"

rlm_code/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
through natural language interactions.
66
"""
77

8-
__version__ = "0.1.8"
8+
__version__ = "0.1.9"
99
__author__ = "Super Agentic AI"

rlm_code/mcp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
)
1818
from .session_wrapper import MCPSessionWrapper
1919

20-
__version__ = "0.1.8"
20+
__version__ = "0.1.9"
2121

2222
__all__ = [
2323
"MCPClientManager",

rlm_code/rlm/runner.py

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import hashlib
1111
import json
12+
import re
1213
import threading
1314
import time
1415
from dataclasses import asdict, dataclass, is_dataclass
@@ -29,7 +30,7 @@
2930
)
3031
from .benchmarks import RLMBenchmarkCase, load_benchmark_packs
3132
from .chat_session import ChatSessionMixin
32-
from .context_store import LazyFileContext
33+
from .context_store import ContextRef, LazyFileContext
3334
from .delegation import DelegationMixin
3435
from .environments import (
3536
DSPyCodingRLMEnvironment,
@@ -467,6 +468,93 @@ def _build_pure_rlm_environment(self, workdir: Path | None = None) -> PureRLMEnv
467468
allow_unsafe_exec=(selected_backend == "exec" and self._pure_rlm_allow_unsafe_exec),
468469
)
469470

471+
def _extract_task_file_refs(self, task: str, limit: int = 12) -> list[ContextRef]:
    """Find explicit workspace file references mentioned in a task string.

    Args:
        task: Free-form task text that may name files such as ``src/app.py``.
        limit: Maximum number of references to return. Values below 1 yield
            an empty list.

    Returns:
        One ``ContextRef`` per distinct path, in order of first appearance.
    """
    if limit <= 0:
        return []

    # The trailing lookahead keeps a short extension from matching the prefix
    # of a longer one. Without it "app.jsx" is captured as "app.js" and
    # "traces.jsonl" as "traces.json" -- files the task never named.
    candidates = re.findall(
        r"(?<![\w.-])(?:[\w.-]+/)*[\w.-]+"
        r"\.(?:py|md|toml|yaml|yml|json|txt|js|jsx|ts|tsx)(?![\w-])",
        task,
    )
    seen: set[str] = set()
    refs: list[ContextRef] = []
    for candidate in candidates:
        # The pattern cannot capture quotes or punctuation, so the only
        # clean-up needed is dropping redundant "./" prefixes. Stripping
        # every leading "." would rewrite "./a.py" to "/a.py" and
        # ".github/ci.yml" to "github/ci.yml".
        normalized = candidate
        while normalized.startswith("./"):
            normalized = normalized[2:]
        if normalized in seen:
            continue
        seen.add(normalized)
        refs.append(ContextRef(path=normalized))
        if len(refs) >= limit:
            break
    return refs
488+
489+
def _build_pure_rlm_initial_context(self, task: str) -> dict[str, str]:
490+
"""
491+
Build a small real-code context for Pure RLM runs.
492+
493+
The direct PureRLMEnvironment API expects context to be initialized
494+
explicitly. Runner/TUI users expect `/rlm run ... env=pure_rlm` to
495+
start with useful workspace data, so we seed `context` with explicit
496+
files named in the task, falling back to a compact repository snapshot.
497+
"""
498+
refs = self._extract_task_file_refs(task)
499+
if not refs:
500+
refs = self.context_store.discover(limit=12)
501+
502+
context: dict[str, str] = {}
503+
for ref in refs:
504+
snippet = self.context_store.read(ref, max_chars=12000)
505+
if snippet:
506+
context[ref.path] = snippet
507+
508+
if context:
509+
return context
510+
511+
discovered = self.context_store.discover(limit=80)
512+
tree = "\n".join(ref.path for ref in discovered)
513+
return {
514+
"_workspace": (
515+
f"Workspace: {self.workdir}\n"
516+
"No explicit file snippets were loaded. Available files:\n"
517+
f"{tree}"
518+
).strip()
519+
}
520+
521+
def _initialize_pure_rlm_run_context(
522+
self,
523+
env: RLMEnvironment,
524+
task: str,
525+
*,
526+
run_id: str,
527+
run_path: Path,
528+
) -> int:
529+
"""Initialize `context` for Pure RLM runs and persist a context event."""
530+
if env.name != "pure_rlm" or not hasattr(env, "initialize_context"):
531+
return 0
532+
533+
context = self._build_pure_rlm_initial_context(task)
534+
env.initialize_context(
535+
context,
536+
description="Workspace files selected for this Pure RLM run",
537+
additional_vars={"query": task},
538+
)
539+
context_event = {
540+
"type": "context",
541+
"run_id": run_id,
542+
"environment": env.name,
543+
"timestamp": self._utc_now(),
544+
"context_files": list(context.keys()),
545+
"context_chars": sum(len(value) for value in context.values()),
546+
}
547+
self._append_event(run_path, context_event)
548+
self._emit_runtime_event(
549+
"context_load",
550+
{
551+
"run_id": run_id,
552+
"files": len(context),
553+
"chars": context_event["context_chars"],
554+
},
555+
)
556+
return len(context)
557+
470558
def run_task(
471559
self,
472560
task: str,
@@ -596,6 +684,12 @@ def run_task(
596684
final_response = ""
597685
cancelled = False
598686
trajectory: list[dict[str, Any]] = []
687+
context_files = self._initialize_pure_rlm_run_context(
688+
env,
689+
cleaned_task,
690+
run_id=run_id,
691+
run_path=run_path,
692+
)
599693
usage_start = self._usage_snapshot()
600694
self.observability.on_run_start(
601695
run_id,
@@ -616,6 +710,7 @@ def run_task(
616710
"parent_run_id": _parent_run_id,
617711
"pure_rlm_backend": self._pure_rlm_backend if env.name == "pure_rlm" else None,
618712
"pure_rlm_strict": strict_pure_mode if env.name == "pure_rlm" else None,
713+
"context_files": context_files if env.name == "pure_rlm" else None,
619714
},
620715
)
621716
self._emit_runtime_event(
@@ -627,6 +722,7 @@ def run_task(
627722
"framework": native_framework,
628723
"depth": _depth,
629724
"parent_run_id": _parent_run_id,
725+
"context_files": context_files if env.name == "pure_rlm" else None,
630726
},
631727
)
632728

rlm_code/rlm/session_replay.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,14 +1035,30 @@ def _convert_legacy_step(data: dict[str, Any]) -> SessionEvent:
10351035
step_type = data.get("type", "")
10361036

10371037
if step_type == "step":
1038+
observation = data.get("observation", {})
1039+
observation_dict = observation if isinstance(observation, dict) else {}
1040+
action = data.get("action", {})
1041+
action_dict = action if isinstance(action, dict) else {}
1042+
success = observation_dict.get("success")
1043+
if success is None:
1044+
success = not bool(observation_dict.get("error") or observation_dict.get("stderr"))
1045+
usage = data.get("usage", {})
1046+
usage_dict = usage if isinstance(usage, dict) else {}
10381047
return SessionEvent(
10391048
event_type=SessionEventType.STEP_END,
10401049
timestamp=data.get("timestamp", _utc_now()),
1041-
step=data.get("step", 0),
1050+
step=int(data.get("step", 0) or 0),
10421051
data={
1043-
"action": data.get("action", {}),
1044-
"observation": data.get("observation", {}),
1052+
"step": int(data.get("step", 0) or 0),
1053+
"timestamp": data.get("timestamp", _utc_now()),
1054+
"action": action_dict,
1055+
"observation": observation_dict,
10451056
"reward": data.get("reward", 0.0),
1057+
"success": bool(success),
1058+
"tokens_used": int(
1059+
usage_dict.get("prompt_tokens", 0) or 0
1060+
)
1061+
+ int(usage_dict.get("completion_tokens", 0) or 0),
10461062
},
10471063
run_id=data.get("run_id", ""),
10481064
depth=data.get("depth", 0),
@@ -1125,25 +1141,35 @@ def _build_snapshot_from_events(
11251141

11261142
elif event.event_type == SessionEventType.STEP_END:
11271143
# Build StepState from accumulated data
1144+
if "step" not in current_step_data:
1145+
current_step_data = {
1146+
"step": int(event.data.get("step", event.step) or 0),
1147+
"timestamp": str(event.data.get("timestamp", event.timestamp) or ""),
1148+
}
11281149
if "step" in current_step_data:
11291150
# Merge any additional data from STEP_END event
11301151
if "action" in event.data:
11311152
action = event.data["action"]
11321153
current_step_data.setdefault("action_type", action.get("action", ""))
11331154
current_step_data.setdefault("action_code", action.get("code", ""))
1155+
current_step_data.setdefault("action_rationale", action.get("reasoning", ""))
11341156
current_step_data.setdefault("raw_action", action)
11351157
if "observation" in event.data:
11361158
obs = event.data["observation"]
11371159
current_step_data.setdefault("output", obs.get("output", obs.get("stdout", "")))
11381160
current_step_data.setdefault("error", obs.get("error", obs.get("stderr", "")))
11391161
current_step_data.setdefault("raw_observation", obs)
11401162
if "reward" in event.data:
1163+
reward = float(event.data.get("reward", 0.0) or 0.0)
1164+
cumulative = event.data.get("cumulative_reward")
1165+
if cumulative is None:
1166+
cumulative = total_reward + reward
11411167
current_step_data.setdefault("reward", event.data["reward"])
1142-
current_step_data.setdefault(
1143-
"cumulative_reward", event.data.get("cumulative_reward", 0.0)
1144-
)
1168+
current_step_data.setdefault("cumulative_reward", cumulative)
11451169
if "success" in event.data:
11461170
current_step_data.setdefault("success", event.data["success"])
1171+
if "tokens_used" in event.data:
1172+
current_step_data.setdefault("tokens_used", event.data["tokens_used"])
11471173

11481174
step_state = StepState(
11491175
step=current_step_data.get("step", 0),
@@ -1163,6 +1189,8 @@ def _build_snapshot_from_events(
11631189
raw_observation=current_step_data.get("raw_observation", {}),
11641190
)
11651191
steps.append(step_state)
1192+
total_reward = float(step_state.cumulative_reward)
1193+
total_tokens += int(step_state.tokens_used or 0)
11661194
current_step_data = {}
11671195

11681196
elif event.event_type == SessionEventType.MEMORY_UPDATE:

rlm_code/rlm/visualizer.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,16 @@ def build_run_visualization(
6262
"success": observation_dict.get("success") if "success" in observation_dict else None,
6363
"path": str(observation_dict.get("path") or ""),
6464
"children_executed": int(observation_dict.get("children_executed") or 0),
65+
"planner_preview": _clip_text(str(step.get("planner_raw") or ""), limit=260),
66+
"code_preview": _clip_text(_action_code(step), limit=260),
67+
"stdout_preview": _clip_text(str(observation_dict.get("stdout") or ""), limit=260),
68+
"stderr_preview": _clip_text(str(observation_dict.get("stderr") or ""), limit=180),
69+
"llm_calls_made": int(observation_dict.get("llm_calls_made") or 0),
70+
"code_blocks_executed": int(observation_dict.get("code_blocks_executed") or 0),
71+
"final_detected": bool(observation_dict.get("final_detected", False)),
72+
"repl_variables": list(observation_dict.get("repl_variables") or [])[:20]
73+
if isinstance(observation_dict.get("repl_variables"), list)
74+
else [],
6575
}
6676
error = _extract_error(step)
6777
if error:
@@ -190,6 +200,19 @@ def _action_name(step: dict[str, Any]) -> str:
190200
return "unknown"
191201

192202

203+
def _action_code(step: dict[str, Any]) -> str:
204+
action = step.get("action")
205+
if not isinstance(action, dict):
206+
return ""
207+
code = action.get("code")
208+
if isinstance(code, str) and code.strip():
209+
return code
210+
blocks = action.get("_code_blocks")
211+
if isinstance(blocks, list):
212+
return "\n\n".join(str(block) for block in blocks if str(block).strip())
213+
return ""
214+
215+
193216
def _extract_error(step: dict[str, Any]) -> str:
194217
observation = step.get("observation")
195218
if not isinstance(observation, dict):

0 commit comments

Comments
 (0)