fix(memory_tree, e2e tests): deterministic query_topic ordering + robust CEF cleanup (tinyhumansai#1751)

sanil-23 · claude · web-flow · commit dd30a4b0298c · 2026-05-14T11:58:18.000-07:00
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/app/scripts/e2e-run-session.sh b/app/scripts/e2e-run-session.sh
@@ -63,11 +63,40 @@ cleanup() {
   fi
   if [ -n "$APP_PID" ]; then
     echo "[runner] Stopping CEF app (pid $APP_PID)..."
+    # CEF spawns helper child processes (zygote, GPU, renderers) that
+    # the parent does not reap on SIGTERM. If we only `kill $APP_PID`
+    # the parent exits but children keep writing into the temp
+    # workspace, and the `rm -rf` below races them and fails with
+    # "Directory not empty" on Linux runners — even though the WDIO
+    # spec itself passed. Stop the parent and its direct children first.
+    #
+    # CRITICAL: capture child PIDs **before** killing the parent.
+    # The instant the parent exits, the kernel reparents its children
+    # to init (PID 1). After that, `pkill -P "$APP_PID"` matches
+    # nothing because no process has the dying parent as its PPID
+    # anymore. Snapshot the PIDs while the relationship still exists,
+    # then signal them directly by PID.
+    CHILD_PIDS="$(pgrep -P "$APP_PID" 2>/dev/null || true)"
+    pkill -TERM -P "$APP_PID" 2>/dev/null || true
     kill "$APP_PID" 2>/dev/null || true
     wait "$APP_PID" 2>/dev/null || true
+    # Brief grace period so CEF helpers can flush their CEF/Default
+    # files and exit on the SIGTERM we already sent. Anything that
+    # ignored it gets SIGKILLed by the captured-PID sweep below.
+    sleep 1
+    if [ -n "$CHILD_PIDS" ]; then
+      for pid in $CHILD_PIDS; do
+        kill -KILL "$pid" 2>/dev/null || true
+      done
+    fi
   fi
   if [ -n "$CREATED_TEMP_WORKSPACE" ]; then
-    rm -rf "$CREATED_TEMP_WORKSPACE"
+    # Tolerate transient races: even after the kill above, a CEF helper
+    # may still be flushing CEF/Default/* on a slow Linux runner. The
+    # workspace is a per-run mktemp under /tmp; anything left behind is
+    # collected by the next CI tmp-cleanup pass. We must not fail the
+    # whole job on cleanup leftovers when the test itself passed.
+    rm -rf "$CREATED_TEMP_WORKSPACE" 2>/dev/null || true
   fi
   if [ -n "$E2E_CONFIG_BACKUP" ] && [ -f "$E2E_CONFIG_BACKUP" ]; then
     mv "$E2E_CONFIG_BACKUP" "$E2E_CONFIG_FILE"
diff --git a/src/openhuman/memory/tree/retrieval/topic.rs b/src/openhuman/memory/tree/retrieval/topic.rs
@@ -88,10 +88,20 @@ pub async fn query_topic(
     // `total` and waste result slots. For duplicates, keep the higher
     // score; if scores tie, prefer the newer `time_range_end`.
     // Flagged on PR #831 CodeRabbit review.
-    use std::collections::HashMap;
-    let mut by_node: HashMap<String, RetrievalHit> = HashMap::new();
-
-    let merge = |map: &mut HashMap<String, RetrievalHit>, hit: RetrievalHit| {
+    //
+    // `BTreeMap` (not `HashMap`) so the post-dedup iteration order is a
+    // deterministic function of `node_id`. The downstream sort is
+    // stable, so when many hits tie on `(score, time_range_end)` —
+    // which is common with the inert embedder used in tests and with
+    // freshly-ingested workspaces where score normalisation hasn't run
+    // — the surviving order falls back to lexicographic `node_id`
+    // instead of `HashMap`'s randomised SipHash iteration. Without
+    // this, `tests/agent_retrieval_e2e.rs::orchestrator_query_topic…`
+    // picked a different "first leaf hit" on each run.
+    use std::collections::BTreeMap;
+    let mut by_node: BTreeMap<String, RetrievalHit> = BTreeMap::new();
+
+    let merge = |map: &mut BTreeMap<String, RetrievalHit>, hit: RetrievalHit| {
         map.entry(hit.node_id.clone())
             .and_modify(|existing| {
                 let better = match hit
@@ -130,12 +140,17 @@ pub async fn query_topic(
         rerank_by_semantic_similarity(config, q, hits).await?
     } else {
         let mut by_score = hits;
-        // Sort: score DESC, then newest first on ties.
+        // Sort: score DESC, then newest first on ties, then `node_id`
+        // ASC as a final tie-break so two hits that match on every
+        // ranked dimension still produce a deterministic order across
+        // runs (matters with the inert embedder used in tests, where
+        // every score lands at 0.0 and only the `node_id` differs).
         by_score.sort_by(|a, b| {
             b.score
                 .partial_cmp(&a.score)
                 .unwrap_or(std::cmp::Ordering::Equal)
                 .then_with(|| b.time_range_end.cmp(&a.time_range_end))
+                .then_with(|| a.node_id.cmp(&b.node_id))
         });
         by_score
     };
diff --git a/tests/agent_retrieval_e2e.rs b/tests/agent_retrieval_e2e.rs
@@ -54,16 +54,37 @@ fn test_config() -> (TempDir, Config) {
 
 // ── RAII env guard shared by all tests in this file ──────────────────────────
 
+/// Process-wide mutex that serialises every test in this binary that
+/// mutates `OPENHUMAN_WORKSPACE`. The test harness runs the tests in a
+/// binary on multiple threads by default (`--test-threads` = CPU count), so without
+/// this serialisation two tests would race on the env var: test A sets
+/// it to `/tmp/aaa`, test B overwrites it with `/tmp/bbb`, then when
+/// B's `TempDir` drops it unlinks `/tmp/bbb` while A is still reading
+/// from it. That race surfaced in CI as `SQLITE_IOERR_FSTAT` (error
+/// code 1802) during a later `with_connection` call on the now-deleted
+/// path, and earlier as `fetch_leaves` returning 0 hits when the
+/// resolved workspace temporarily pointed at the wrong sibling test's
+/// (otherwise empty) tempdir.
+///
+/// `unwrap_or_else(|p| p.into_inner())` keeps the lock usable after a
+/// poisoning panic so one failing test never cascades.
+static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
 struct EnvGuard {
     key: &'static str,
     prev: Option<std::ffi::OsString>,
+    /// Released only after `Drop::drop` returns (fields drop after the
+    /// struct's own `drop`), so the next test acquires the lock against
+    /// a restored `OPENHUMAN_WORKSPACE` value.
+    _lock: std::sync::MutexGuard<'static, ()>,
 }
 
 impl Drop for EnvGuard {
     fn drop(&mut self) {
         // SAFETY: cargo test runs each integration test binary in its own
-        // process; nothing else in this bin mutates these env vars, and the
-        // guard restores the previous value on drop.
+        // process; the `ENV_LOCK` mutex held in `_lock` serialises every
+        // mutation made through `set_workspace_env`, and the guard
+        // restores the previous value before the lock is released.
         unsafe {
             match self.prev.take() {
                 Some(v) => std::env::set_var(self.key, v),
@@ -77,13 +98,19 @@ impl Drop for EnvGuard {
 /// restores the previous value on drop. This makes the tool wrappers (which
 /// call `load_config_with_timeout` internally) resolve to the same workspace
 /// that was used for ingest.
+///
+/// The returned guard also holds [`ENV_LOCK`] for its lifetime, so concurrent
+/// tests in the same binary cannot stomp on each other's
+/// `OPENHUMAN_WORKSPACE` setting.
 fn set_workspace_env(tmp: &TempDir) -> EnvGuard {
+    let lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
     let prev = std::env::var_os("OPENHUMAN_WORKSPACE");
     // SAFETY: see EnvGuard::Drop above.
     unsafe { std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path()) };
     EnvGuard {
         key: "OPENHUMAN_WORKSPACE",
         prev,
+        _lock: lock,
     }
 }