77C5 adds:
88 - POST /troubleshoot/user-reply (matches Phase 11 contract)
99 - POST /troubleshoot/phone-context (matches Phase 11 contract)
10+
11+ 2026-05-28 resume support:
12+ - POST /troubleshoot now starts the generator as a detached
13+ `asyncio.create_task`; the SSE handler is just a buffer reader.
14+ Disconnecting the SSE consumer no longer cancels the model run.
15+ - GET /troubleshoot/resume?session_id=X&from=N reattaches to the
16+ same task's buffered output. App-side persistence (lastEventSeq
17+ in AsyncStorage) plus an AppState foreground subscriber makes
18+ background-then-resume seamless.
1019"""
1120from __future__ import annotations
1221
22+ import asyncio
1323import json
1424import logging
25+ from typing import AsyncIterator
1526
16- from fastapi import APIRouter , HTTPException , Request , Response
27+ from fastapi import APIRouter , HTTPException , Query , Request , Response
1728from fastapi .responses import JSONResponse , StreamingResponse
1829from pydantic import BaseModel , ConfigDict , Field
1930
20- from src .session .manager import sanitize_for_log
31+ from src .session .manager import SessionState , sanitize_for_log
2132from src .session .tool_call_loop import stream_troubleshoot
2233
2334
@@ -61,6 +72,124 @@ def _error_body(code: str, detail: str = "") -> dict:
6172 return out
6273
6374
async def _drive_generator_into_buffer(
    session: SessionState,
    backend,
    tool_executor,
    validator,
    prompt: str,
) -> None:
    """Run the bridge generator detached from any SSE consumer.

    Every event produced by `stream_troubleshoot` is written into the
    session via `session.append_event`. SSE consumers attach and detach
    through `_stream_from_buffer`; this coroutine is meant to be the
    sole writer, so it keeps running when a consumer disconnects.

    Args:
        session: Session whose buffer receives the events.
        backend: Object exposing `run_troubleshoot(prompt=, session_id=)`.
            Its optional `consumes_tool_results` attribute (default
            False) is forwarded as `backend_handles_tools`.
        tool_executor: Passed through to `stream_troubleshoot`.
        validator: Passed through to `stream_troubleshoot`.
        prompt: User prompt handed to the backend.

    Raises:
        asyncio.CancelledError: Re-raised on cancellation, after the
            session has been marked done.

    Any other exception is logged and converted into a synthetic
    `error` event so the consumer can render the failure. `mark_done`
    runs exactly once, in `finally`, on every exit path (normal
    completion, failure, cancellation).
    """
    backend_handles_tools = getattr(backend, "consumes_tool_results", False)
    try:
        backend_events = backend.run_troubleshoot(
            prompt=prompt,
            session_id=session.session_id,
        )
        async for event in stream_troubleshoot(
            backend_events, tool_executor, validator,
            session=session,
            backend_handles_tools=backend_handles_tools,
        ):
            await session.append_event(event)
    except asyncio.CancelledError:
        # Shutdown or explicit cancel: propagate. The `finally` below
        # marks the session done, so no extra mark_done is needed here
        # (calling it in both places ran it twice per cancellation).
        raise
    except Exception as e:  # noqa: BLE001
        logger.exception("background generator failed session=%s", session.session_id)
        try:
            await session.append_event({
                "type": "error",
                "code": "INTERNAL_ERROR",
                "message": str(e)[:200],
                "recoverable": False,
            })
        except Exception:  # noqa: BLE001
            # Still best-effort (never re-raise from here), but leave a
            # trace instead of swallowing the failure silently.
            logger.exception(
                "failed to buffer error event session=%s", session.session_id
            )
    finally:
        await session.mark_done()
122+
123+
async def _stream_from_buffer(
    session: SessionState,
    from_seq: int,
) -> AsyncIterator[str]:
    """Yield SSE-formatted strings from the session's event buffer.

    Streams every buffered `(seq, event)` pair with `seq >= from_seq`,
    then blocks on `session.cond` for more until the generator is
    marked done or a newer consumer claims the session (last-wins).

    Args:
        session: Session exposing `event_buffer` (ordered `(seq, event)`
            pairs), `cond`, `generator_done` and `consumer_generation`.
        from_seq: First sequence number the caller wants to receive.

    Yields:
        One SSE frame per event: `id: <seq>` followed by `data: <json>`.
        The `id:` field carries the seq so the client can record it and
        send it back as `?from=` on reconnect.
        NOTE(review): `from_seq` is inclusive here; if the client sends
        back the last id it received unchanged, that event is delivered
        twice — confirm against the client.

    Truncation marker: if `from_seq` is older than the oldest buffered
    seq, a synthetic `thought` event with the dropped count is emitted
    first, using `id: -1` so it is never mistaken for a resume offset.

    Ordering guarantees:
      * The buffer is re-scanned after every drain, because the
        producer may append (and even finish) while this generator is
        suspended at a `yield`. `generator_done` only ends the stream
        once a scan finds nothing new, so trailing events are not lost.
      * The last-wins check runs before each drain, so a superseded
        consumer exits without emitting further events.
      * The wake-up conditions are re-checked after acquiring
        `session.cond` and immediately before `wait()`, so a
        notification sent while we were busy cannot be missed.
    """
    my_generation = session.consumer_generation
    last_yielded_seq = from_seq - 1

    # Truncation detection — fires when the consumer asks for an event
    # older than what is still buffered (including from_seq=0 after the
    # head of the buffer has already been dropped).
    if session.event_buffer and session.event_buffer[0][0] > from_seq:
        gap = session.event_buffer[0][0] - from_seq
        marker = {
            "type": "thought",
            "payload": (
                f"[resume] {gap} earlier event(s) dropped from the on-device "
                f"buffer (cap 500). Newer events from this session continue below."
            ),
        }
        yield f"id: -1\ndata: {json.dumps(marker, separators=(',', ':'))}\n\n"

    while True:
        # Last-wins: another /troubleshoot or /resume call took over.
        # Checked before draining so this consumer never doubles frames
        # that the newer consumer will also read from the buffer.
        if session.consumer_generation != my_generation:
            return

        # Materialize a snapshot so we never iterate the live buffer
        # across a suspension point (the producer task mutates it).
        pending = [
            (seq, event)
            for seq, event in session.event_buffer
            if seq > last_yielded_seq
        ]
        for seq, event in pending:
            yield f"id: {seq}\ndata: {json.dumps(event, separators=(',', ':'))}\n\n"
            last_yielded_seq = seq
        if pending:
            # Events may have arrived while we were suspended at the
            # yields above; scan again before deciding to exit or wait.
            continue

        # Nothing new in the buffer: safe to honour the done flag.
        if session.generator_done:
            return

        async with session.cond:
            # Acquiring the lock may have suspended us; re-check so a
            # notify that already fired is not waited for again.
            has_new = bool(session.event_buffer) and (
                session.event_buffer[-1][0] > last_yielded_seq
            )
            if (
                has_new
                or session.generator_done
                or session.consumer_generation != my_generation
            ):
                continue
            await session.cond.wait()
191+
192+
64193@router .post ("/troubleshoot" )
65194async def troubleshoot (req : TroubleshootRequest , request : Request ) -> Response :
66195 backend = request .app .state .backend
@@ -69,7 +198,7 @@ async def troubleshoot(req: TroubleshootRequest, request: Request) -> Response:
69198 session_mgr = request .app .state .session_manager
70199
71200 # Resolve session: caller-supplied wins; else mint new.
72- session = None
201+ session : SessionState | None = None
73202 if req .session_id :
74203 session = session_mgr .get (req .session_id )
75204 if session is None :
@@ -80,30 +209,47 @@ async def troubleshoot(req: TroubleshootRequest, request: Request) -> Response:
80209 else :
81210 session = session_mgr .create ()
82211
83- backend_events = backend .run_troubleshoot (
84- prompt = req .prompt ,
85- session_id = session .session_id ,
86- )
212+ # If this session already has a running generator (caller retried
213+ # POST on the same session_id while a previous task is still
214+ # active), reject. Resume via GET /troubleshoot/resume instead so
215+ # we don't spawn duplicate writers into one buffer.
216+ if (
217+ session .generator_task is not None
218+ and not getattr (session .generator_task , "done" , lambda : True )()
219+ and not session .generator_done
220+ ):
221+ return JSONResponse (
222+ status_code = 409 ,
223+ content = _error_body (
224+ "session_already_active" ,
225+ "use GET /troubleshoot/resume?session_id=...&from=N to reattach" ,
226+ ),
227+ )
87228
88- backend_handles_tools = getattr (backend , "consumes_tool_results" , False )
229+ # Reset per-session buffer state for the new generator. We keep the
230+ # SessionState (so phone_context, reply_queue, etc. survive) but
231+ # wipe the conversation buffer so the new prompt starts at seq 0.
232+ session .event_buffer = []
233+ session .next_seq = 0
234+ session .dropped_count = 0
235+ session .generator_done = False
236+ session .consumer_generation += 1
237+ my_generation = session .consumer_generation
238+
239+ session .generator_task = asyncio .create_task (
240+ _drive_generator_into_buffer (
241+ session , backend , tool_executor , validator , req .prompt ,
242+ ),
243+ name = f"blox-ai-generator-{ session .session_id } " ,
244+ )
89245
90246 async def sse_stream ():
247+ # consumer_generation was bumped above; capture it for the
248+ # last-wins check in _stream_from_buffer.
249+ session .consumer_generation = my_generation
91250 try :
92- async for event in stream_troubleshoot (
93- backend_events , tool_executor , validator ,
94- session = session ,
95- backend_handles_tools = backend_handles_tools ,
96- ):
97- yield f"data: { json .dumps (event , separators = (',' , ':' ))} \n \n "
98- except Exception :
99- logger .exception ("unexpected bridge failure" )
100- fallback = {
101- "type" : "error" ,
102- "code" : "INTERNAL_ERROR" ,
103- "message" : "unexpected bridge failure" ,
104- "recoverable" : False ,
105- }
106- yield f"data: { json .dumps (fallback , separators = (',' , ':' ))} \n \n "
251+ async for chunk in _stream_from_buffer (session , from_seq = 0 ):
252+ yield chunk
107253 finally :
108254 # Slide TTL on stream completion so a session that just
109255 # finished a turn doesn't expire instantly.
@@ -119,6 +265,54 @@ async def sse_stream():
119265 )
120266
121267
@router.get("/troubleshoot/resume")
async def troubleshoot_resume(
    request: Request,
    session_id: str = Query(min_length=1, max_length=128),
    from_seq: int = Query(alias="from", default=0, ge=0),
) -> Response:
    """Reattach to a /troubleshoot session's existing generator output.

    Args:
        request: Used to reach `app.state.session_manager`.
        session_id: Session to reattach to (query param, 1-128 chars).
        from_seq: First seq to replay, supplied as `?from=N` (>= 0,
            default 0).

    Returns:
        A 404 JSON error (`session_not_found`) when the session manager
        does not know the session — the original notes list TTL/LRU
        eviction or a container restart as causes. Otherwise a
        `text/event-stream` response fed by `_stream_from_buffer`, which
        replays buffered events from `from_seq` and then follows the
        live buffer.

    Last-wins: the consumer generation is bumped so a previously
    attached consumer exits. Waiters on `session.cond` are notified
    right away; without that, an older consumer already parked in
    `cond.wait()` would only notice the takeover when the producer
    emitted its next event.
    """
    session_mgr = request.app.state.session_manager
    session = session_mgr.get(session_id)
    if session is None:
        return JSONResponse(
            status_code=404,
            content=_error_body("session_not_found"),
        )
    # Slide TTL — resume IS user activity.
    session_mgr.touch(session.session_id)
    # Last-wins: bump generation so any prior consumer's loop exits...
    session.consumer_generation += 1
    # ...and wake it so it re-checks the generation now.
    # NOTE(review): assumes session.cond is an asyncio.Condition (it is
    # used with `async with` / `wait()` elsewhere in this module) —
    # confirm against SessionState.
    async with session.cond:
        session.cond.notify_all()

    async def sse_stream():
        try:
            async for chunk in _stream_from_buffer(session, from_seq=from_seq):
                yield chunk
        finally:
            # Slide TTL again when the stream ends or the client leaves.
            session_mgr.touch(session.session_id)

    return StreamingResponse(
        sse_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
314+
315+
122316@router .post ("/troubleshoot/user-reply" )
123317async def user_reply (req : UserReplyRequest , request : Request ) -> Response :
124318 """Phase 11 contract: the app submits this when the user answers a
0 commit comments