CodeGhost21
diff --git a/‎docs/AGENT_SELF_LEARNING.md‎
Lines changed: 368 additions & 0 deletions b/‎docs/AGENT_SELF_LEARNING.md‎
Lines changed: 368 additions & 0 deletions
diff --git a/‎src/core/event_bus/events.rs‎
Lines changed: 53 additions & 1 deletion b/‎src/core/event_bus/events.rs‎
Lines changed: 53 additions & 1 deletion
diff --git a/‎src/core/event_bus/events_tests.rs‎
Lines changed: 23 additions & 0 deletions b/‎src/core/event_bus/events_tests.rs‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎src/openhuman/agent/harness/archivist.rs‎
Lines changed: 186 additions & 0 deletions b/‎src/openhuman/agent/harness/archivist.rs‎
Lines changed: 186 additions & 0 deletions
diff --git a/‎src/openhuman/agent/harness/session/builder.rs‎
Lines changed: 31 additions & 0 deletions b/‎src/openhuman/agent/harness/session/builder.rs‎
Lines changed: 31 additions & 0 deletions
@@ -353,6 +353,55 @@ pub enum DomainEvent {
         routed: bool,
     },
 
+    // ── Memory tree ─────────────────────────────────────────────────────
+    /// A document (chat batch, email thread, or standalone document) was
+    /// fully canonicalised and its chunks written to the memory tree.
+    ///
+    /// Emitted by `memory::tree::ingest::persist()` after the chunk upsert
+    /// and extract-job enqueue complete. Subscribers (Phase 2 producers such
+    /// as the email-signature parser) react to this to inspect the
+    /// canonicalised content.
+    ///
+    /// Grouped with the other memory events under the `"memory"` domain string.
+    DocumentCanonicalized {
+        /// The source identifier passed to the ingest call (e.g. `"gmail:abc"`,
+        /// `"conversations:agent"`).
+        source_id: String,
+        /// Kind of content — `"chat"`, `"email"`, `"document"`.
+        source_kind: String,
+        /// Number of chunks written to `vector_chunks` in this ingest.
+        ///
+        /// NOTE(review): presumably equal to `chunk_ids.len()` — confirm
+        /// against the emitter before relying on it.
+        chunks_written: usize,
+        /// IDs of the chunks that were written.
+        chunk_ids: Vec<String>,
+        /// Wall-clock seconds since epoch when canonicalisation completed.
+        canonicalized_at: f64,
+        /// Last ≤ 2 048 characters of the canonicalised markdown body.
+        ///
+        /// Populated for `email` and `document` sources so that lightweight
+        /// subscribers (e.g. the email-signature parser) can inspect trailing
+        /// content without hitting disk. `None` for `chat` sources where the
+        /// content is conversational and doesn't contain signature-style structure.
+        body_preview: Option<String>,
+    },
+
+    // ── Learning ─────────────────────────────────────────────────────────
+    /// The stability detector finished a full cache rebuild cycle.
+    ///
+    /// Emitted by `learning::stability_detector` (Phase 3) after writing
+    /// the new snapshot to `user_profile_facets`. Subscribers (Phase 4
+    /// `profile_md_renderer`) react to re-render the `PROFILE.md` managed
+    /// blocks.
+    ///
+    /// Reported under the `"learning"` domain string.
+    ///
+    /// NOTE(review): the counters look like they should satisfy
+    /// `total_size == added + kept` — confirm against the emitter.
+    CacheRebuilt {
+        /// Number of facets added in this cycle.
+        added: usize,
+        /// Number of facets evicted (below τ_evict threshold) in this cycle.
+        evicted: usize,
+        /// Number of facets unchanged / carried over.
+        kept: usize,
+        /// Total facets in the cache after the rebuild.
+        total_size: usize,
+        /// Wall-clock seconds since epoch when the rebuild completed.
+        rebuilt_at: f64,
+    },
+
     // ── System lifecycle ────────────────────────────────────────────────
     /// A system component started up.
     SystemStartup { component: String },
@@ -389,7 +438,10 @@ impl DomainEvent {
             | Self::MemoryRecalled { .. }
             | Self::MemorySyncRequested { .. }
             | Self::MemoryIngestionStarted { .. }
-            | Self::MemoryIngestionCompleted { .. } => "memory",
+            | Self::MemoryIngestionCompleted { .. }
+            | Self::DocumentCanonicalized { .. } => "memory",
+
+            Self::CacheRebuilt { .. } => "learning",
 
             Self::ChannelInboundMessage { .. }
             | Self::ChannelMessageReceived { .. }
 
@@ -415,6 +415,29 @@ fn all_variants_have_correct_domain() {
             },
             "system",
         ),
+        // Memory tree
+        (
+            DomainEvent::DocumentCanonicalized {
+                source_id: "gmail:abc".into(),
+                source_kind: "email".into(),
+                chunks_written: 3,
+                chunk_ids: vec!["c1".into(), "c2".into(), "c3".into()],
+                canonicalized_at: 1_700_000_000.0,
+                body_preview: Some("Thanks,\nAlice".into()),
+            },
+            "memory",
+        ),
+        // Learning
+        (
+            DomainEvent::CacheRebuilt {
+                added: 2,
+                evicted: 1,
+                kept: 5,
+                total_size: 7,
+                rebuilt_at: 1_700_000_000.0,
+            },
+            "learning",
+        ),
     ];
 
     for (event, expected_domain) in cases {
 
@@ -6,14 +6,19 @@
 //! 2. Manages conversation segments (boundary detection + lifecycle).
 //! 3. On segment close: extracts events (heuristic) and updates user profile.
 //! 4. Extracts simple lessons from tool failures.
+//! 5. (Phase 1 / #566) Pipes the turn into the memory tree as `conversations:agent`
+//!    when `config.learning.chat_to_tree_enabled` is true.
 
 use crate::openhuman::agent::hooks::{PostTurnHook, TurnContext};
+use crate::openhuman::config::Config;
 use crate::openhuman::memory::store::events::{self, EventRecord, EventType};
 use crate::openhuman::memory::store::fts5::{self, EpisodicEntry};
 use crate::openhuman::memory::store::profile::{self, FacetType};
 use crate::openhuman::memory::store::segments::{
     self, BoundaryConfig, BoundaryDecision, ConversationSegment,
 };
+use crate::openhuman::memory::tree::canonicalize::chat::{ChatBatch, ChatMessage};
+use crate::openhuman::memory::tree::ingest;
 use async_trait::async_trait;
 use parking_lot::Mutex;
 use rusqlite::Connection;
@@ -31,24 +36,43 @@ pub struct ArchivistHook {
     enabled: bool,
     /// Boundary detection configuration.
     boundary_config: BoundaryConfig,
+    /// Optional runtime config — used to gate the tree-ingest path.
+    ///
+    /// When `None`, the tree-ingest path is skipped. Set via
+    /// [`ArchivistHook::with_config`] on the production path.
+    config: Option<Config>,
 }
 
 impl ArchivistHook {
     /// Create an Archivist hook with a shared SQLite connection.
+    ///
+    /// The hook starts with the default [`BoundaryConfig`] and no runtime
+    /// config attached, so the tree-ingest path is skipped. To turn it on,
+    /// attach a config with [`Self::with_config`]; ingest then runs only when
+    /// that config has `learning.chat_to_tree_enabled` set to `true`.
     pub fn new(conn: Arc<Mutex<Connection>>, enabled: bool) -> Self {
         Self {
             conn: Some(conn),
             enabled,
             boundary_config: BoundaryConfig::default(),
+            config: None,
         }
     }
 
+    /// Attach runtime config so the archivist can gate the tree-ingest path.
+    ///
+    /// When `config.learning.chat_to_tree_enabled` is `true`, each completed
+    /// turn is also piped into the memory tree as `source="conversations:agent"`.
+    ///
+    /// Builder-style: consumes `self` and returns it with `config` stored,
+    /// replacing any config attached by an earlier call.
+    pub fn with_config(mut self, config: Config) -> Self {
+        self.config = Some(config);
+        self
+    }
+
     /// Create a disabled/no-op Archivist (when FTS5 is not enabled).
+    ///
+    /// The returned hook has no SQLite connection, `enabled == false`, the
+    /// default [`BoundaryConfig`], and no runtime config attached.
     pub fn disabled() -> Self {
         Self {
             conn: None,
             enabled: false,
             boundary_config: BoundaryConfig::default(),
+            config: None,
         }
     }
 
@@ -365,11 +389,173 @@ impl PostTurnHook for ArchivistHook {
             current_episodic_id,
         );
 
+        // ── Phase 1 / #566: pipe turn into the memory tree ───────────────────
+        // Gate: only when config is attached and chat_to_tree_enabled is true.
+        // Non-fatal: if tree-ingest fails, the episodic write already succeeded
+        // and the turn result is not affected.
+        if let Some(ref cfg) = self.config {
+            if cfg.learning.chat_to_tree_enabled {
+                tracing::debug!(
+                    "[archivist] piping turn into tree as conversations:agent session={}",
+                    session_id
+                );
+                self.pipe_turn_to_tree(cfg, ctx, session_id, timestamp)
+                    .await;
+            }
+        }
+
         tracing::debug!("[archivist] turn indexed successfully");
         Ok(())
     }
 }
 
+impl ArchivistHook {
+    /// Pipe the completed turn into the memory tree as `source="conversations:agent"`.
+    ///
+    /// Tool-call markup is stripped from the assistant text before ingest — only
+    /// the assistant's prose response flows into the tree (memory ingestion
+    /// policy: tool outputs must not reach memory).
+    ///
+    /// Messages whose text is blank are left out of the batch. This matters for
+    /// the assistant side in particular: stripping can leave nothing behind when
+    /// the whole response was tool-call markup. If no message has any text, the
+    /// ingest call is skipped entirely.
+    ///
+    /// # Parameters
+    /// - `config`: runtime config forwarded to `ingest::ingest_chat`.
+    /// - `ctx`: the completed turn; `user_message` and `assistant_response` are read.
+    /// - `session_id`: used as the channel label, the owner, and inside the
+    ///   `agent://session/{session_id}` source reference.
+    /// - `timestamp`: turn time as fractional seconds since the Unix epoch.
+    ///
+    /// Failures are logged and swallowed; the episodic write is the source of
+    /// truth.
+    async fn pipe_turn_to_tree(
+        &self,
+        config: &Config,
+        ctx: &TurnContext,
+        session_id: &str,
+        timestamp: f64,
+    ) {
+        use chrono::{DateTime, TimeZone, Utc};
+
+        // Fractional epoch seconds -> UTC instant. The nanosecond part is
+        // clamped so float rounding can never produce an out-of-range value;
+        // `None` means chrono rejected the seconds value.
+        let to_utc = |epoch_secs: f64| -> Option<DateTime<Utc>> {
+            let nanos = ((epoch_secs.fract() * 1e9) as u32).min(999_999_999);
+            Utc.timestamp_opt(epoch_secs as i64, nanos).single()
+        };
+
+        // Build turn timestamps. The assistant message is offset by 1ms as in
+        // the episodic write so ordering is stable.
+        let user_ts = to_utc(timestamp).unwrap_or_else(Utc::now);
+        let asst_ts = to_utc(timestamp + 0.001).unwrap_or(user_ts);
+
+        // Strip tool-call markup from the assistant response.
+        // Per memory ingestion policy, structured tool-call payloads must not
+        // flow into the tree — only the prose response is ingested.
+        let assistant_prose = strip_tool_calls_from_response(&ctx.assistant_response);
+
+        // Both messages point back at the same session.
+        let source_ref = format!("agent://session/{session_id}");
+
+        // Only messages that still carry text are worth ingesting.
+        let mut messages = Vec::with_capacity(2);
+        if !ctx.user_message.trim().is_empty() {
+            messages.push(ChatMessage {
+                author: "user".into(),
+                timestamp: user_ts,
+                text: ctx.user_message.clone(),
+                source_ref: Some(source_ref.clone()),
+            });
+        }
+        if !assistant_prose.trim().is_empty() {
+            messages.push(ChatMessage {
+                author: "assistant".into(),
+                timestamp: asst_ts,
+                text: assistant_prose,
+                source_ref: Some(source_ref),
+            });
+        }
+        if messages.is_empty() {
+            tracing::debug!(
+                "[archivist] tree ingest skipped: no prose left after stripping session={}",
+                session_id
+            );
+            return;
+        }
+
+        let batch = ChatBatch {
+            platform: "agent".into(),
+            channel_label: session_id.to_string(),
+            messages,
+        };
+
+        // Use the session_id as the owner / identity tag.
+        let source_id = "conversations:agent";
+        let owner = session_id;
+        let tags = vec!["agent_chat".to_string()];
+
+        match ingest::ingest_chat(config, source_id, owner, tags, batch).await {
+            Ok(result) => {
+                tracing::debug!(
+                    "[archivist] tree ingest ok: source_id={} chunks_written={} session={}",
+                    source_id,
+                    result.chunks_written,
+                    session_id
+                );
+            }
+            Err(e) => {
+                tracing::warn!(
+                    "[archivist] tree ingest failed (non-fatal): source_id={} session={} error={e}",
+                    source_id,
+                    session_id
+                );
+            }
+        }
+    }
+}
+
+/// Strip tool-call markup from an assistant response, leaving only the
+/// prose text.
+///
+/// The archivist stores the full response (including `tool_calls_json`) in
+/// the episodic log for diagnostic purposes. However, per the memory
+/// ingestion policy, structured tool-call payloads must not reach the memory
+/// tree — only the assistant's natural-language prose is ingested.
+///
+/// Two kinds of span are removed, scanning left to right:
+///
+/// * `<tool_call>…</tool_call>` blocks (may span multiple lines);
+/// * raw JSON objects that begin with `{"tool_calls"`, matched up to their
+///   balancing `}` (braces inside JSON strings are ignored).
+///
+/// A span that is never closed is removed through to the end of the input.
+///
+/// After removal, trailing whitespace is trimmed from every line, runs of
+/// blank lines are capped at two, and the whole result is trimmed.
+///
+/// A response with no tool-call marker at all is returned unchanged (no
+/// whitespace normalisation). The output may be empty if the entire response
+/// was tool-call markup — callers should handle that case.
+///
+/// NOTE(review): `"tool_use"` is treated as a marker for the fast-path check
+/// only; there is no removal rule for `tool_use` blocks here.
+fn strip_tool_calls_from_response(response: &str) -> String {
+    const OPEN_TAG: &str = "<tool_call>";
+    const CLOSE_TAG: &str = "</tool_call>";
+    const JSON_OPEN: &str = "{\"tool_calls\"";
+
+    // Fast path: if the response contains no obvious tool-call markers, return
+    // it unchanged.
+    if !response.contains(OPEN_TAG)
+        && !response.contains(JSON_OPEN)
+        && !response.contains("\"tool_use\"")
+    {
+        return response.to_string();
+    }
+
+    // Single left-to-right pass: copy prose, skip whichever tool-call span
+    // starts first, repeat on the remainder.
+    let mut cleaned = String::with_capacity(response.len());
+    let mut rest = response;
+    loop {
+        let (start, is_tag) = match (rest.find(OPEN_TAG), rest.find(JSON_OPEN)) {
+            (None, None) => {
+                cleaned.push_str(rest);
+                break;
+            }
+            (Some(tag), Some(json)) if json < tag => (json, false),
+            (Some(tag), _) => (tag, true),
+            (None, Some(json)) => (json, false),
+        };
+        cleaned.push_str(&rest[..start]);
+
+        let span = &rest[start..];
+        let span_len = if is_tag {
+            span.find(CLOSE_TAG).map(|end| end + CLOSE_TAG.len())
+        } else {
+            balanced_json_object_len(span)
+        };
+        match span_len {
+            Some(len) => rest = &span[len..],
+            // Unclosed tag / object — drop everything from its start onwards.
+            None => break,
+        }
+    }
+
+    // Trim trailing whitespace per line and keep at most two consecutive
+    // blank lines where blocks were removed.
+    let mut result = String::with_capacity(cleaned.len());
+    let mut blank_run = 0usize;
+    for line in cleaned.lines().map(str::trim_end) {
+        if line.is_empty() {
+            blank_run += 1;
+            if blank_run <= 2 {
+                result.push('\n');
+            }
+        } else {
+            blank_run = 0;
+            result.push_str(line);
+            result.push('\n');
+        }
+    }
+
+    result.trim().to_string()
+}
+
+/// Byte length of the brace-balanced JSON object at the start of `text`.
+///
+/// Braces inside double-quoted strings (including after `\"` escapes) do not
+/// count. Returns `None` when the object is never closed or `text` does not
+/// open with `{`. All delimiters are ASCII, so the returned length is always
+/// a valid `str` boundary.
+fn balanced_json_object_len(text: &str) -> Option<usize> {
+    let mut depth = 0usize;
+    let mut in_string = false;
+    let mut escaped = false;
+    for (idx, byte) in text.bytes().enumerate() {
+        if in_string {
+            if escaped {
+                escaped = false;
+            } else if byte == b'\\' {
+                escaped = true;
+            } else if byte == b'"' {
+                in_string = false;
+            }
+            continue;
+        }
+        match byte {
+            b'"' => in_string = true,
+            b'{' => depth += 1,
+            b'}' => {
+                // A `}` before any `{` means this is not an object start.
+                depth = depth.checked_sub(1)?;
+                if depth == 0 {
+                    return Some(idx + 1);
+                }
+            }
+            _ => {}
+        }
+    }
+    None
+}
+
 /// Extract simple lessons from tool call outcomes (no LLM needed).
 fn extract_lesson_from_tools(
     tool_calls: &[crate::openhuman::agent::hooks::ToolCallRecord],
 
@@ -762,6 +762,8 @@ impl Agent {
                 .add_section(Box::new(
                     crate::openhuman::learning::UserProfileSection::new(memory.clone()),
                 ));
+            // NOTE: MemoryAccessSection is added after tool-filtering so we can
+            // gate it on retrieval-tool visibility — see below.
             log::info!(
                 "[learning] prompt sections registered (user_reflections, learned_context, user_profile)"
             );
@@ -928,6 +930,35 @@ impl Agent {
                 .map(|t| t.name().to_string())
                 .collect(),
         };
+
+        // Phase 4 (#566): add the MemoryAccessSection bias instruction only
+        // when at least one retrieval tool is actually loaded AND survives
+        // filtering. We require both because:
+        //   - the tool may be filtered out by the agent's scope config
+        //   - the tool may not be registered at all on this agent (tool
+        //     listing is build-time configurable)
+        // An empty `visible` set means "no filter" (wildcard / orchestrator
+        // path); in that case any registered retrieval tool is reachable.
+        if config.learning.enabled {
+            let recall_tools = ["memory_recall", "memory_search"];
+            let has_retrieval = recall_tools.iter().any(|name| {
+                let registered = tools.iter().any(|t| t.name() == *name)
+                    || delegation_tools.iter().any(|t| t.name() == *name);
+                let allowed_by_filter = visible.is_empty() || visible.contains(*name);
+                registered && allowed_by_filter
+            });
+            if has_retrieval {
+                prompt_builder = prompt_builder
+                    .add_section(Box::new(crate::openhuman::learning::MemoryAccessSection));
+                log::debug!("[learning] memory_access prompt section registered");
+            } else {
+                log::debug!(
+                    "[learning] skipping MemoryAccessSection — neither memory_recall nor \
+                     memory_search is registered+visible for agent={agent_id}"
+                );
+            }
+        }
+
         // De-duplicate: some synthesised tool names may collide with
         // already-registered tools (unlikely for `delegate_*` names but
         // cheap to guard against).