Skip to content

Commit 0e6d109

Browse files
Added MLflow tracing and context-size logging. Added an SSE heartbeat to keep the frontend connection alive while waiting for long LLM calls.
1 parent 4611851 commit 0e6d109

4 files changed

Lines changed: 47 additions & 5 deletions

File tree

app.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ env:
2929
- name: MLFLOW_REGISTRY_URI
3030
value: "databricks-uc"
3131
- name: MLFLOW_EXPERIMENT_ID
32-
value: "" # OPTIONAL: Set to your MLflow experiment ID to enable tracing
32+
value: "2624606869914616"
3333

3434
# ---------------------------------------------------------------------------
3535
# LLM Model

backend/routers/create.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
/api/create — UC discovery + config validation + Genie Space creation wizard + agent chat.
33
"""
4+
import asyncio
45
import json
56
import logging
67
from fastapi import APIRouter, HTTPException, Request
@@ -158,13 +159,26 @@ async def agent_chat(body: AgentChatRequest, request: Request):
158159
# that outlive the middleware's call_next).
159160
user_token = getattr(request.state, "user_token", "")
160161

162+
_KEEPALIVE_INTERVAL = 15 # seconds between SSE keepalive comments
163+
161164
async def event_stream():
    """Stream the agent's reply as SSE, with keepalive comments while idle.

    Yields a ``session`` event first, then one SSE event per item produced
    by ``agent.chat(session, user_message)``. Whenever the agent produces
    nothing for ``_KEEPALIVE_INTERVAL`` seconds, an SSE comment line
    (``": keepalive"``) is emitted instead so the connection is not idle.
    The session is persisted once the agent stream is exhausted, and the
    OBO user token is always cleared on exit.
    """
    if user_token:
        set_obo_user_token(user_token)
    try:
        yield _sse_event("session", {"session_id": session.session_id})

        agent_iter = agent.chat(session, user_message).__aiter__()
        exhausted = object()  # sentinel: the agent generator has finished

        async def _next_event():
            # Translate StopAsyncIteration into a sentinel so the result can
            # be carried by a Task without special-casing the exception.
            try:
                return await agent_iter.__anext__()
            except StopAsyncIteration:
                return exhausted

        # The in-flight __anext__() must survive a keepalive timeout.
        # asyncio.wait_for() would cancel it on timeout, which throws
        # CancelledError into the agent generator and ends the turn.
        # asyncio.wait() only stops waiting; the task keeps running.
        # NOTE(review): each step now runs in its own task, so contextvars
        # set inside agent.chat may not carry over between steps -- confirm
        # this is fine for the OBO token / tracing.
        pending = None
        try:
            while True:
                if pending is None:
                    pending = asyncio.ensure_future(_next_event())
                done, _ = await asyncio.wait(
                    {pending}, timeout=_KEEPALIVE_INTERVAL
                )
                if not done:
                    yield ": keepalive\n\n"
                    continue
                finished, pending = pending, None
                event = finished.result()  # re-raises agent errors
                if event is exhausted:
                    break
                yield _sse_event(event["event"], event["data"])
        finally:
            # Client disconnect or error: stop the in-flight agent step.
            if pending is not None:
                pending.cancel()

        await persist_session(session)
    finally:
        clear_obo_user_token()

backend/services/create_agent.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,21 @@ async def chat(
8686
step = detect_step(session)
8787
step_idx = STEP_ORDER.index(step) if step in STEP_ORDER else 0
8888

89+
# Start a root MLflow trace for the entire agent turn.
90+
# Wrapped in try/except so tracing failures never break the app.
91+
_trace = None
92+
try:
93+
_trace = mlflow.start_trace(
94+
name="agent_chat",
95+
inputs={"user_message": user_message, "session_id": session.session_id, "step": step},
96+
)
97+
mlflow.update_current_trace(tags={
98+
"session_id": session.session_id,
99+
"workflow_step": step,
100+
})
101+
except Exception:
102+
logger.debug("MLflow start_trace failed, continuing without tracing", exc_info=True)
103+
89104
yield {"event": "step", "data": {
90105
"step": step,
91106
"label": STEP_LABELS.get(step, step),
@@ -301,9 +316,20 @@ async def chat(
301316
step, tools_used, error_msg,
302317
)
303318

319+
try:
320+
if _trace is not None:
321+
status = "ERROR" if error_msg else "OK"
322+
mlflow.end_trace(
323+
request_id=_trace.request_id,
324+
outputs={"tools_used": tools_used, "error": error_msg},
325+
status=status,
326+
)
327+
except Exception:
328+
logger.debug("MLflow end_trace failed", exc_info=True)
329+
304330
yield {"event": "done", "data": {}}
305331

306-
_TOOL_RESULT_CHAR_LIMIT = 6000
332+
_TOOL_RESULT_CHAR_LIMIT = 3000
307333
_COMPRESSIBLE_TOOLS = frozenset({
308334
"describe_table", "profile_columns", "profile_table_usage",
309335
"assess_data_quality", "test_sql",

frontend/src/lib/api.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,9 @@ export function streamAgentChat(
687687
})
688688
.catch((error) => {
689689
if (error.name !== "AbortError") {
690-
callbacks.onError(error.message || "Connection failed")
690+
callbacks.onError(error.message === "network error"
691+
? "Connection interrupted — your progress is saved. Send another message to continue."
692+
: (error.message || "Connection failed"))
691693
callbacks.onDone()
692694
}
693695
})

0 commit comments

Comments
 (0)