ChingEnLin
diff --git a/‎backend/main.py‎
Lines changed: 21 additions & 0 deletions b/‎backend/main.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎backend/queryargus‎ b/‎backend/queryargus‎
diff --git a/‎backend/routes/argus.py‎
Lines changed: 59 additions & 3 deletions b/‎backend/routes/argus.py‎
Lines changed: 59 additions & 3 deletions
diff --git a/‎backend/services/argus_live_events.py‎
Lines changed: 155 additions & 0 deletions b/‎backend/services/argus_live_events.py‎
Lines changed: 155 additions & 0 deletions
@@ -1,16 +1,37 @@
+import logging
 import os
 from contextlib import asynccontextmanager
 
 import uvicorn
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from queryargus.observability.logging_observer import JsonFormatter
 from routes import query, azure, system, user_queries, data_documents, audit, argus
 
 
+def _install_argus_log_handler() -> None:
+    """Attach a JSON-formatting stream handler to the ``queryargus.run`` logger.
+
+    The handler is a default ``logging.StreamHandler`` (which writes to
+    stderr) formatted by ``JsonFormatter``, so each event is emitted as one
+    JSON line. The logger level is set to INFO and propagation is switched
+    off, so these records are not handed on to ancestor loggers' handlers.
+
+    Meant to be invoked from ``lifespan`` instead of at import time: merely
+    importing ``main`` (e.g. during test collection) then leaves logging
+    untouched, which keeps ``caplog``-style capture through the root logger
+    working.
+
+    Safe to call repeatedly: a handler installed by an earlier call is
+    recognised by its ``_querypal_argus`` marker attribute and nothing is
+    added a second time.
+    """
+    run_logger = logging.getLogger("queryargus.run")
+    for existing in run_logger.handlers:
+        if getattr(existing, "_querypal_argus", False):
+            # Already installed — lifespan may run twice under some test
+            # harnesses, so leave the logger exactly as it is.
+            return
+
+    json_handler = logging.StreamHandler()
+    json_handler._querypal_argus = True  # type: ignore[attr-defined]
+    json_handler.setFormatter(JsonFormatter())
+    run_logger.addHandler(json_handler)
+    run_logger.propagate = False
+    run_logger.setLevel(logging.INFO)
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    """Run startup work for the FastAPI app, then yield control to it.
+
+    Before yielding this installs the Argus JSON log handler and calls
+    ``get_report_store()``.
+
+    NOTE(review): presumably ``get_report_store()`` is called here to
+    initialise the store eagerly at startup — confirm against
+    ``services.argus_store``. Whether any shutdown work follows the ``yield``
+    is not visible in this hunk.
+    """
+    # Imported inside the function rather than at module top level.
     from services.argus_store import get_report_store
 
+    # Installed here (not at import) so importing this module has no logging
+    # side effects.
+    _install_argus_log_handler()
     get_report_store()
     yield
 
 
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import json
 import logging
 import uuid
 from datetime import datetime, timezone
@@ -22,6 +23,9 @@
 from queryargus.models.connection import CosmosConnection
 from queryargus.models.finding import Finding
 from queryargus.models.report import AuditReport
+from queryargus.observability.cost import CostTracker
+from queryargus.observability.logging_observer import StructuredLogObserver
+from services.argus_live_events import LiveEventBuffer
 from services.argus_profiles_service import (
     ProfileNameConflict,
     create_profile,
@@ -71,6 +75,11 @@
+# Process-local registry of audit jobs, looked up by job id. Entries are plain
+# dicts; keys read elsewhere in this module include "created_by" and "live".
 _JOBS: dict[str, dict[str, Any]] = {}
+# NOTE(review): presumably the cap on how many entries _JOBS retains — the
+# code that enforces it is not visible in this hunk; confirm.
 _MAX_JOBS = 50
 
+# StructuredLogObserver carries no per-run state besides the current run_id
+# (re-set in on_run_start), so one module-level instance is shared by all runs.
+# NOTE(review): that run_id is still shared mutable state — if two jobs can
+# execute at the same time, log lines from one run may be tagged with the
+# other's run_id. Confirm jobs are serialized, or build the observer per run.
+# CostTracker must be constructed per-request — it accumulates token buckets.
+_LOG_OBSERVER = StructuredLogObserver()
+
 
 class AuditRequest(BaseModel):
     account_id: str
@@ -97,17 +106,26 @@ def _summary(description: str) -> str:
 
 
 def _finding_trace(report: AuditReport, finding: Finding) -> str:
+    """One JSON object per relevant agent step (JSONL).
+
+    A step from ``report.run_trace`` is relevant when ``finding.field``
+    occurs as a substring of ``repr(action.action_input)``. Each kept step
+    becomes one line with the keys ``iter`` (1-based position in the trace),
+    ``action`` and ``reason``; ``write_finding`` steps additionally carry
+    ``"gate": "PASS"``.
+
+    The frontend parses each line and renders a structured block. Falls back
+    gracefully to plain-text display if a line fails to parse.
+
+    Returns the lines joined with newlines, or an empty string when no step
+    mentions the field.
+    """
     field = finding.field
     lines: list[str] = []
     for i, action in enumerate(report.run_trace, start=1):
+        # Matching is a plain substring test against the repr of the input,
+        # so a field name contained in another name or value also matches.
         inp_repr = repr(action.action_input)
         is_write = action.action == "write_finding" and field in inp_repr
+        # NOTE(review): is_write already requires the field to match, so
+        # `and not is_write` never changes the outcome of this filter.
         if field not in inp_repr and not is_write:
             continue
-        lines.append(f"iter {i} · {action.action}")
-        lines.append(f"reason: {action.reasoning}")
+        entry: dict[str, Any] = {
+            "iter": i,
+            "action": action.action,
+            "reason": action.reasoning,
+        }
         if is_write:
-            lines.append("finding gate: PASS")
+            entry["gate"] = "PASS"
+        # default=str stringifies any value json cannot encode natively
+        # instead of raising; ensure_ascii=False keeps non-ASCII text as-is.
+        lines.append(json.dumps(entry, ensure_ascii=False, default=str))
     return "\n".join(lines)
 
 
@@ -235,6 +253,9 @@ def _serialize_report(
         "counts": counts,
         "diff": diff_counts,
         "findings": findings,
+        "cost": (
+            report.cost.model_dump(mode="json") if report.cost is not None else None
+        ),
         "created_by": created_by,
         "history": None,
     }
@@ -362,11 +383,14 @@ async def _execute_job(
             else:
                 judge_model_name = jm
             judge_llm = GeminiClient(model=judge_model_name)
+        live = LiveEventBuffer()
+        job["live"] = live
         agent = ArgusAgent.from_config(
             config=config,
             llm=llm,
             judge_llm=judge_llm,
             judge_model_name=judge_model_name,
+            observers=[_LOG_OBSERVER, CostTracker(), live],
         )
 
         history = None
@@ -500,6 +524,38 @@ async def get_run(job_id: str, authorization: str = Header(...)):
     )
 
 
+@router.get("/runs/{job_id}/events")
+async def get_run_events(
+    job_id: str,
+    authorization: str = Header(...),
+    cursor: int = Query(default=0, ge=0),
+):
+    """Return the live-progress snapshot of a job for incremental polling.
+
+    Pass the ``next_cursor`` of the previous response as ``cursor`` to receive
+    only newer events. Besides the events, the payload contains rolled-up
+    aggregates (current_iter, findings_count, running token totals,
+    last_action / last_tool) so the UI can show progress without re-folding
+    the event stream itself.
+
+    Access control mirrors ``get_run``: a job owned by a different caller is
+    answered with 404 rather than 403, so job existence is not leaked across
+    tenants.
+
+    Raises:
+        HTTPException: 401 when the Authorization header is not a Bearer
+            token; 404 when the job is unknown or belongs to another caller.
+    """
+    bearer_prefix = "Bearer "
+    if not authorization.startswith(bearer_prefix):
+        raise HTTPException(status_code=401, detail="Invalid token format")
+    caller_email = extract_email_from_token(authorization[len(bearer_prefix) :])
+
+    job = _JOBS.get(job_id)
+    # A missing job and a job owned by someone else are deliberately
+    # indistinguishable to the caller.
+    owner = job.get("created_by") if job is not None else None
+    if job is None or (owner and owner != caller_email):
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    buffer: Optional[LiveEventBuffer] = job.get("live")
+    if buffer is not None:
+        return JSONResponse(content=buffer.snapshot(since=cursor))
+    # No buffer yet: the run hasn't reached the observer-attach point (still
+    # waiting on Azure auth / connection-string), or the job pre-dates this
+    # feature. Answer with an empty snapshot.
+    return JSONResponse(content={"events": [], "next_cursor": 0, "aggregates": {}})
+
+
 @router.get("/runs")
 async def list_runs(
     authorization: str = Header(...),
 
@@ -0,0 +1,155 @@
+"""Per-job in-memory buffer of structured agent events for live progress polling.
+
+Attached as a third observer alongside StructuredLogObserver + CostTracker on
+each Argus run. Mirrors StructuredLogObserver's event shapes so the frontend
+can render one timeline regardless of source.
+
+Thread-safe: ArgusAgent.run is invoked via run_in_threadpool, so the FastAPI
+request loop reads `snapshot()` from one thread while the worker thread writes
+events from another.
+
+Bounded by a ring buffer so a runaway agent cannot OOM the process; the live
+view is for progress, not the system of record (the persisted AuditReport is).
+"""
+
+from __future__ import annotations
+
+from collections import deque
+from datetime import datetime, timezone
+from threading import Lock
+from typing import Any, Literal
+from uuid import UUID
+
+from queryargus.llm.client import TokenUsage
+from queryargus.models.action import AgentAction
+from queryargus.models.finding import Finding
+from queryargus.models.report import AuditReport
+
+# Default number of events retained per job before the oldest are evicted.
+_MAX_EVENTS_PER_JOB = 500
+
+
+def _ts() -> str:
+    """Return the current UTC time as an ISO-8601 string."""
+    return datetime.now(timezone.utc).isoformat()
+
+
+class LiveEventBuffer:
+    """Bounded, lock-protected buffer of run events plus rolled-up aggregates.
+
+    Implements the RunObserver callbacks (``on_*``). Every callback appends
+    one event dict of the form ``{"event", "run_id", "ts", **details}`` and,
+    where applicable, updates an aggregate counter inside the same critical
+    section, so a snapshot never shows aggregates that are ahead of the
+    event list.
+
+    Cursors handed out by ``snapshot`` are absolute sequence numbers (the
+    count of events ever pushed), not indexes into the ring buffer. Polling
+    therefore keeps working after the buffer is full and old events are
+    being evicted; a poller that falls more than ``max_events`` behind simply
+    receives the events that are still retained.
+    """
+
+    def __init__(self, max_events: int = _MAX_EVENTS_PER_JOB) -> None:
+        """Create an empty buffer retaining at most ``max_events`` events."""
+        self._events: deque[dict[str, Any]] = deque(maxlen=max_events)
+        self._lock = Lock()
+        self._run_id: str | None = None
+        # Count of events ever pushed, including ones already evicted.
+        self._total_pushed = 0
+        self.current_iter = 0
+        self.findings_count = 0
+        self.input_tokens = 0
+        self.output_tokens = 0
+        self.last_action: str | None = None
+        self.last_tool: str | None = None
+        self.tool_errors = 0
+
+    def _append_locked(self, event: str, extras: dict[str, Any]) -> None:
+        """Append one event. The caller must already hold ``self._lock``."""
+        self._events.append(
+            {"event": event, "run_id": self._run_id, "ts": _ts(), **extras}
+        )
+        self._total_pushed += 1
+
+    def _push(self, event: str, /, **extras: object) -> None:
+        """Append one event that has no aggregate to update."""
+        with self._lock:
+            self._append_locked(event, extras)
+
+    def snapshot(self, since: int = 0) -> dict[str, Any]:
+        """Return events newer than cursor ``since`` plus current aggregates.
+
+        Args:
+            since: ``next_cursor`` of the previous snapshot (0 for the first
+                poll). Values below 0 are treated as 0; values beyond the
+                current cursor yield no events.
+
+        Returns:
+            A dict with ``events`` (retained events with sequence number
+            >= ``since``), ``next_cursor`` (total events pushed so far) and
+            ``aggregates``.
+        """
+        with self._lock:
+            evs = list(self._events)
+            total = self._total_pushed
+            aggregates = {
+                "current_iter": self.current_iter,
+                "findings_count": self.findings_count,
+                "input_tokens": self.input_tokens,
+                "output_tokens": self.output_tokens,
+                "last_action": self.last_action,
+                "last_tool": self.last_tool,
+                "tool_errors": self.tool_errors,
+            }
+        # Sequence number of evs[0]; greater than 0 once eviction has begun.
+        first_seq = total - len(evs)
+        start = max(since - first_seq, 0)
+        return {"events": evs[start:], "next_cursor": total, "aggregates": aggregates}
+
+    # RunObserver protocol -------------------------------------------------
+
+    def on_run_start(self, *, run_id: UUID, collection: str) -> None:
+        """Remember the run id (stamped on every later event) and log the start."""
+        with self._lock:
+            self._run_id = str(run_id)
+            self._append_locked("run_start", {"collection": collection})
+
+    def on_iteration_start(self, *, iter: int) -> None:
+        """Record the iteration number now in progress."""
+        with self._lock:
+            self.current_iter = iter
+            self._append_locked("iteration_start", {"iter": iter})
+
+    def on_llm_call(
+        self,
+        *,
+        purpose: Literal["propose_action", "self_eval", "judge"],
+        model: str,
+        usage: TokenUsage,
+        latency_ms: int,
+    ) -> None:
+        """Add the call's token usage to the running totals and log the call."""
+        with self._lock:
+            self.input_tokens += usage.input_tokens
+            self.output_tokens += usage.output_tokens
+            self._append_locked(
+                "llm_call",
+                {
+                    "purpose": purpose,
+                    "model": model,
+                    "input_tokens": usage.input_tokens,
+                    "output_tokens": usage.output_tokens,
+                    "latency_ms": latency_ms,
+                },
+            )
+
+    def on_tool_call(
+        self,
+        *,
+        name: str,
+        args_summary: str,
+        ok: bool,
+        latency_ms: int,
+        error: str | None,
+    ) -> None:
+        """Track the most recent tool, count failed calls and log the call."""
+        with self._lock:
+            self.last_tool = name
+            if not ok:
+                self.tool_errors += 1
+            self._append_locked(
+                "tool_call",
+                {
+                    "tool": name,
+                    "args_summary": args_summary,
+                    "ok": ok,
+                    "latency_ms": latency_ms,
+                    "error": error,
+                },
+            )
+
+    def on_action(self, *, action: AgentAction) -> None:
+        """Track the most recent action name and log it with its confidence."""
+        with self._lock:
+            self.last_action = action.action
+            self._append_locked(
+                "action",
+                {"action": action.action, "confidence": action.confidence},
+            )
+
+    def on_finding(self, *, finding: Finding) -> None:
+        """Count the finding and log its field, category and severity."""
+        with self._lock:
+            self.findings_count += 1
+            self._append_locked(
+                "finding",
+                {
+                    "field": finding.field,
+                    "category": finding.category,
+                    "severity": str(finding.severity),
+                },
+            )
+
+    def on_eval(
+        self,
+        *,
+        target: Literal["action", "finding", "run"],
+        verdict: str,
+        score: float,
+        evaluator: str,
+    ) -> None:
+        """Log an evaluation result; no aggregate is affected."""
+        self._push(
+            "eval", target=target, verdict=verdict, score=score, evaluator=evaluator
+        )
+
+    def on_run_complete(self, *, report: AuditReport) -> None:
+        """Log the final summary: finding count, duration in ms and total cost."""
+        self._push(
+            "run_complete",
+            findings_count=len(report.findings),
+            duration_ms=int(report.duration_seconds * 1000),
+            usd_total=(report.cost.usd_total if report.cost is not None else None),
+        )