Merge pull request #78 from FluffyAIcode/AgentMemory/v04-pr-k1g-memory-tracking-8e7f

FluffyAIcode · web-flow · commit b2727654313a · 2026-06-09T08:28:15.000+08:00
PR-K1.G: memory usage tracking for K1.E NIAH validation (stacked on #77)
diff --git a/inference_engine/v04/__init__.py b/inference_engine/v04/__init__.py
@@ -49,12 +49,15 @@
     NIAHSample,
     aggregate_recall,
     evaluate,
+    format_memory_summary,
     greedy_decode_oracle,
     greedy_decode_sink_window,
     greedy_decode_v04,
     make_niah_dataset,
     make_sink_window_4d_mask,
     recall_predicate,
+    record_memory,
+    reset_memory_peak,
 )
 
 __all__ = [
@@ -83,4 +86,8 @@
     "make_niah_dataset",
     "make_sink_window_4d_mask",
     "recall_predicate",
+    # K1.G — memory tracking
+    "format_memory_summary",
+    "record_memory",
+    "reset_memory_peak",
 ]
diff --git a/inference_engine/v04/niah_eval.py b/inference_engine/v04/niah_eval.py
@@ -42,7 +42,7 @@
 import math
 import random
 import time
-from typing import Callable, List, Optional, Sequence, Tuple
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
 
 import torch
 import torch.nn as nn
@@ -497,3 +497,173 @@ def evaluate(
         decoded_texts.append(text)
         latencies_s.append(latency)
     return aggregate_recall(name, samples, decoded_texts, latencies_s)
+
+
+# ---------------------------------------------------------------------------
+# Memory measurement helpers
+# ---------------------------------------------------------------------------
+#
+# ADR 0008 §11.5 §"Five properties" item 1 — "constant memory in
+# context length" — is a measurable claim, not a presumption. The
+# helpers below let runners record per-config peak / current memory
+# on the active device and emit it into the run's JSON evidence so
+# the constant-memory claim becomes empirically verifiable rather
+# than rhetorical.
+#
+# CUDA: torch.cuda.max_memory_allocated tracks the high-water mark
+# since the last reset. Reset before each config evaluation, sample
+# after, and the peak is the config's memory cost.
+#
+# MPS: torch.mps does not expose a peak counter as of torch 2.x, so
+# we record current_allocated and driver_allocated as point-in-time
+# samples. Mac runs cannot demonstrate the sustained-memory claim
+# with the same precision as CUDA runs but they can still show
+# rough magnitudes.
+#
+# CPU: optional dependency on psutil. If present, RSS is recorded;
+# if absent, memory fields are None and the run continues. Tests
+# pass psutil-less to verify graceful degradation.
+
+
+def reset_memory_peak(device: torch.device) -> None:
+    """Reset the peak-memory counters of a CUDA ``device``.
+
+    After this call, the ``peak_*`` fields of a later
+    :func:`record_memory` snapshot cover only the period since the
+    reset. For any other device type the call does nothing and
+    returns ``None``, so runners can call it unconditionally.
+
+    Args:
+        device: Device whose ``type`` selects the behaviour.
+    """
+    if device.type != "cuda":
+        # MPS: this module uses no peak counter, so there is nothing
+        # to reset. CPU: record_memory reports whole-process RSS;
+        # callers wanting a delta must subtract their own "before"
+        # snapshot.
+        return
+    torch.cuda.synchronize(device)  # let in-flight kernels finish
+    torch.cuda.empty_cache()  # hand cached, unused blocks back first
+    torch.cuda.reset_peak_memory_stats(device)
+
+
+def record_memory(device: torch.device) -> Dict[str, Any]:
+    """Capture a JSON-serialisable memory snapshot for ``device``.
+
+    Every snapshot has ``device_kind``, ``device_name``,
+    ``device_total_bytes``, ``current_allocated_bytes``,
+    ``peak_allocated_bytes`` and ``peak_reserved_bytes``. Byte
+    fields are ``int`` when measured and ``None`` otherwise.
+
+    * **cuda**: every field is filled from ``torch.cuda``, and
+      ``current_reserved_bytes`` is added. The ``peak_*`` values
+      are high-water marks since the last peak reset (see
+      :func:`reset_memory_peak`).
+    * **mps**: ``current_allocated_bytes`` and an extra
+      ``driver_allocated_bytes`` are sampled; each is ``None`` if
+      its ``torch.mps`` query raises. Total and peak fields are
+      ``None`` and ``device_name`` is the fixed ``"Apple MPS"``.
+    * **any other kind** (CPU included): ``device_kind`` is
+      ``device.type`` and ``device_name`` is ``str(device)``.
+      ``current_allocated_bytes`` is the process RSS from the
+      optional ``psutil`` package, or ``None`` if psutil is
+      missing or the query fails. Total and peak are ``None``.
+
+    Only the CUDA path synchronizes the device before sampling;
+    the MPS and CPU paths read the counters as they are.
+
+    Args:
+        device: Device to sample; ``device.type`` picks the branch.
+
+    Returns:
+        The snapshot dict described above.
+    """
+    if device.type == "cuda":
+        # Wait for queued kernels so the counters are up to date.
+        torch.cuda.synchronize(device)
+        props = torch.cuda.get_device_properties(device)
+        return {
+            "device_kind": "cuda",
+            "device_name": props.name,
+            "device_total_bytes": int(props.total_memory),
+            "current_allocated_bytes": int(torch.cuda.memory_allocated(device)),
+            "current_reserved_bytes": int(torch.cuda.memory_reserved(device)),
+            "peak_allocated_bytes": int(torch.cuda.max_memory_allocated(device)),
+            "peak_reserved_bytes": int(torch.cuda.max_memory_reserved(device)),
+        }
+    if device.type == "mps":
+        # Each query is guarded on its own so that one failing
+        # counter does not discard the other reading.
+        current: Optional[int]
+        driver: Optional[int]
+        try:
+            current = int(torch.mps.current_allocated_memory())
+        except Exception:
+            current = None
+        try:
+            driver = int(torch.mps.driver_allocated_memory())
+        except Exception:
+            driver = None
+        return {
+            "device_kind": "mps",
+            "device_name": "Apple MPS",
+            "device_total_bytes": None,
+            "current_allocated_bytes": current,
+            "driver_allocated_bytes": driver,
+            "peak_allocated_bytes": None,
+            "peak_reserved_bytes": None,
+        }
+    # CPU or any other device kind: fall back to process RSS.
+    rss: Optional[int]
+    try:
+        # Imported lazily: psutil is an optional dependency.
+        import psutil  # type: ignore
+        rss = int(psutil.Process().memory_info().rss)
+    except Exception:
+        # Best effort: psutil missing or the query failed.
+        rss = None
+    return {
+        "device_kind": device.type,
+        "device_name": str(device),
+        "device_total_bytes": None,
+        "current_allocated_bytes": rss,
+        "peak_allocated_bytes": None,
+        "peak_reserved_bytes": None,
+    }
+
+
+def format_memory_summary(snapshot: Dict[str, Any]) -> str:
+    """Return a one-line human-readable summary of a memory snapshot.
+
+    Args:
+        snapshot: Dict shaped like the output of
+            :func:`record_memory`. Missing or ``None`` byte fields
+            are tolerated.
+
+    Returns:
+        A prefix-free string; callers prepend their own tag.
+    """
+
+    def gb(num_bytes: Optional[int]) -> str:
+        # Decimal gigabytes (1e9 bytes); "n/a" when the value is unknown.
+        return "n/a" if num_bytes is None else f"{num_bytes / 1e9:.2f}GB"
+
+    kind = snapshot.get("device_kind", "?")
+    cur = snapshot.get("current_allocated_bytes")
+    if kind == "cuda":
+        peak = snapshot.get("peak_allocated_bytes")
+        total = snapshot.get("device_total_bytes")
+        if peak is not None and total is not None and total > 0:
+            pct = peak / total * 100
+            # cur may be None even when the peak is known; gb() handles that.
+            return (
+                f"cuda peak={peak / 1e9:.2f}GB ({pct:.0f}% of "
+                f"{total / 1e9:.0f}GB)  current={gb(cur)}"
+            )
+        return f"cuda peak={peak} current={cur}"
+    if kind == "mps":
+        drv = snapshot.get("driver_allocated_bytes")
+        return f"mps current={gb(cur)} driver={gb(drv)} (no peak counter)"
+    if cur is not None:
+        return f"cpu rss={cur / 1e9:.2f}GB"
+    return f"{kind} (no memory accounting available)"
diff --git a/scripts/research/k1e_niah_validation.py b/scripts/research/k1e_niah_validation.py
@@ -137,10 +137,13 @@ def main() -> int:
         DLMRestoredVerifier,
         NIAHEvalResult,
         evaluate,
+        format_memory_summary,
         greedy_decode_oracle,
         greedy_decode_sink_window,
         greedy_decode_v04,
         make_niah_dataset,
+        record_memory,
+        reset_memory_peak,
     )
 
     samples = make_niah_dataset(
@@ -173,7 +176,22 @@ def encode_chat(prompt_text: str) -> torch.Tensor:
         file=sys.stderr,
     )
 
+    # K1.G: baseline memory snapshot. Captured BEFORE any config
+    # runs, after model + tokenizer + dataset are loaded — represents
+    # the minimum sustained working set for this run. It is stored
+    # next to the absolute per-config snapshots so readers can
+    # subtract it and check the constant-memory claim of ADR 0008
+    # §11.5 §"Five properties" item 1 from the JSON evidence.
+    reset_memory_peak(device)
+    baseline_memory = record_memory(device)
+    print(
+        f"[k1e] baseline memory after model+dataset load: "
+        f"{format_memory_summary(baseline_memory)}",
+        file=sys.stderr,
+    )
+
     results = {}
+    memory_per_config = {}
 
     # ----------------------------------------------------------------
     # (a) full-attention oracle
@@ -191,14 +209,21 @@ def oracle_decode(sample) -> Tuple[str, float]:
             )
             return text, time.perf_counter() - t0
 
+        reset_memory_peak(device)
         oracle = evaluate("oracle_full_attention", samples, oracle_decode)
+        oracle_memory = record_memory(device)
         results["oracle_full_attention"] = _result_to_dict(oracle)
+        memory_per_config["oracle_full_attention"] = oracle_memory
         print(
             f"[k1e]    oracle recall={oracle.recall:.3f} "
             f"({oracle.samples_correct}/{oracle.samples_total})  "
             f"mean_latency={oracle.mean_latency_s:.2f}s",
             file=sys.stderr,
         )
+        print(
+            f"[k1e]    oracle memory:  {format_memory_summary(oracle_memory)}",
+            file=sys.stderr,
+        )
 
     # ----------------------------------------------------------------
     # (b) v0.3 sink+window baseline
@@ -221,14 +246,21 @@ def v03_decode(sample) -> Tuple[str, float]:
             )
             return text, time.perf_counter() - t0
 
+        reset_memory_peak(device)
         v03 = evaluate("v03_sink_window", samples, v03_decode)
+        v03_memory = record_memory(device)
         results["v03_sink_window"] = _result_to_dict(v03)
+        memory_per_config["v03_sink_window"] = v03_memory
         print(
             f"[k1e]    v0.3 recall={v03.recall:.3f} "
             f"({v03.samples_correct}/{v03.samples_total})  "
             f"mean_latency={v03.mean_latency_s:.2f}s",
             file=sys.stderr,
         )
+        print(
+            f"[k1e]    v0.3 memory:   {format_memory_summary(v03_memory)}",
+            file=sys.stderr,
+        )
 
     # ----------------------------------------------------------------
     # (c) v0.4 DLMRestoredVerifier
@@ -255,14 +287,21 @@ def v04_decode(sample) -> Tuple[str, float]:
             )
             return text, time.perf_counter() - t0
 
+        reset_memory_peak(device)
         v04 = evaluate("v04_dlm_restored", samples, v04_decode)
+        v04_memory = record_memory(device)
         results["v04_dlm_restored"] = _result_to_dict(v04)
+        memory_per_config["v04_dlm_restored"] = v04_memory
         print(
             f"[k1e]    v0.4 recall={v04.recall:.3f} "
             f"({v04.samples_correct}/{v04.samples_total})  "
             f"mean_latency={v04.mean_latency_s:.2f}s",
             file=sys.stderr,
         )
+        print(
+            f"[k1e]    v0.4 memory:   {format_memory_summary(v04_memory)}",
+            file=sys.stderr,
+        )
 
     # ----------------------------------------------------------------
     # Gate evaluation (only meaningful if both oracle and v04 ran)
@@ -283,7 +322,9 @@ def v04_decode(sample) -> Tuple[str, float]:
             gate["v04_dominates_v03"] = v04_recall > v03_recall
 
     report = {
-        "schema_version": 1,
+        # schema v2: K1.G adds a 'memory' block with 'baseline' and
+        # 'per_config'. Consumers reading v1 reports must default it to {}.
+        "schema_version": 2,
         "kind": "k1e_niah_validation",
         "config": {
             "model": args.model,
@@ -302,6 +343,10 @@ def v04_decode(sample) -> Tuple[str, float]:
             "prompt_token_len_mean": sum(seq_lens) // len(seq_lens),
         },
         "results": results,
+        "memory": {
+            "baseline": baseline_memory,
+            "per_config": memory_per_config,
+        },
         "gate": gate,
     }
 
@@ -316,9 +361,15 @@ def v04_decode(sample) -> Tuple[str, float]:
     # Top-line summary
     print("[k1e] ─── SUMMARY ──────────────────────────────────────", file=sys.stderr)
     for name, r in results.items():
+        mem = memory_per_config.get(name, {})
+        mem_str = ""
+        if mem.get("device_kind") == "cuda" and mem.get("peak_allocated_bytes") is not None:
+            mem_str = f"  peak_mem={mem['peak_allocated_bytes'] / 1e9:.2f}GB"
+        elif mem.get("device_kind") == "mps" and mem.get("current_allocated_bytes") is not None:
+            mem_str = f"  current_mem={mem['current_allocated_bytes'] / 1e9:.2f}GB"
         print(
             f"[k1e]   {name:<24s}  recall={r['recall']:.3f}  "
-            f"mean_latency={r['mean_latency_s']:.2f}s",
+            f"mean_latency={r['mean_latency_s']:.2f}s{mem_str}",
             file=sys.stderr,
         )
     if gate:
diff --git a/tests/inference_engine/v04/test_niah_eval.py b/tests/inference_engine/v04/test_niah_eval.py