Skip to content

Commit 052d03a

Browse files
authored
CI fixes (#290)
* fix: defer graph init to background, prevent health-check timeout

  Move RetrieverGraph construction out of module-level import in conversations.py and into a background thread spawned during the FastAPI lifespan. This lets the server start instantly so the Docker health-check passes within the reduced 30 s start_period (instead of timing out after 22+ min waiting for FAISS embedding).

  - Replace module-level rg = RetrieverGraph(...) with lazy singleton (get_graph / start_graph_init / reset_graph_state_for_testing)
  - Add /conversations/ready readiness probe returning 'ready' or 'initializing'
  - Conversation endpoints return 503 / stream error when graph is not yet initialized
  - Add readiness poll loop (30 min, 10 s intervals) before Run LLM CI step in ci-secret.yaml
  - Reduce Docker healthcheck start_period default from 1200 s to 30 s
  - Update streaming tests to use new public reset_graph_state_for_testing()

  Signed-off-by: Jack Luar <jluar@precisioninno.com>

  ---------

  Signed-off-by: Jack Luar <jluar@precisioninno.com>
1 parent b55cc8e commit 052d03a

5 files changed

Lines changed: 92 additions & 27 deletions

File tree

.github/workflows/ci-secret.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,18 @@ jobs:
6363
run: |
6464
make docker-up
6565
66+
- name: Wait for graph readiness
  run: |
    # Poll the backend readiness probe until it reports the graph is built.
    # The attempt budget lives in one variable so the loop bound and the
    # progress message cannot drift apart (360 attempts x 10 s of sleep).
    max_attempts=360
    echo "Waiting for graph to finish initializing..."
    for i in $(seq 1 "$max_attempts"); do
      # -f makes curl fail on HTTP errors; grep matches only the JSON value "ready".
      if curl -sf http://localhost:8000/conversations/ready | grep -q '"ready"'; then
        echo "Graph is ready"
        exit 0
      fi
      echo "Waiting for graph initialization... ($i/$max_attempts)"
      sleep 10
    done
    # Fail the step on timeout so later steps do not run against a graph
    # that never finished initializing.
    echo "Graph did not become ready after $max_attempts attempts" >&2
    exit 1
77+
6678
- name: Run LLM CI
6779
id: llm_tests
6880
working-directory: evaluation

backend/src/api/main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111

1212
@asynccontextmanager
1313
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
14-
"""Initialize database on startup."""
14+
"""Initialize database on startup and start graph background init."""
1515
logger.info("Initializing database connection...")
1616
init_database()
17+
logger.info("Starting graph initialization in background...")
18+
from .routers.conversations import start_graph_init
19+
start_graph_init()
1720
yield
1821
logger.info("Shutting down...")
1922

backend/src/api/routers/conversations.py

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import os
22
import logging
3+
import threading
34
from dotenv import load_dotenv
45

5-
from typing import Any
6+
from typing import Any, Optional
67
from uuid import UUID
78
from fastapi import APIRouter, Depends, HTTPException
89
from langchain_google_vertexai import ChatVertexAI
@@ -204,17 +205,56 @@ def parse_agent_output(output: list) -> tuple[str, list[ContextSource], list[str
204205
return llm_response, context_sources, tools
205206

206207

207-
rg = RetrieverGraph(
208-
llm_model=llm,
209-
embeddings_config=embeddings_config,
210-
reranking_model_name=hf_reranker,
211-
use_cuda=use_cuda,
212-
inbuilt_tool_calling=True,
213-
fast_mode=fast_mode,
214-
debug=debug,
215-
enable_mcp=enable_mcp,
216-
)
217-
rg.initialize()
208+
# Lazily built graph singleton: stays None until the background
# initialization has finished and published the instance.
_rg: Optional[RetrieverGraph] = None
# Set once start_graph_init() has launched the background initialization.
_rg_started = threading.Event()
# Set once the graph has been built and assigned to _rg.
_rg_ready = threading.Event()
211+
212+
213+
def get_graph() -> Optional[RetrieverGraph]:
    """Return the initialized graph, or None while it is not ready.

    Readiness is decided by the ``_rg_ready`` event rather than by ``_rg``
    itself, so callers only get the graph after initialization has completed.
    """
    if not _rg_ready.is_set():
        return None
    return _rg
216+
217+
218+
def _initialize_graph() -> None:
    """Build and initialize the RetrieverGraph (runs in a background thread).

    On success the graph is published through the module-level ``_rg`` and
    ``_rg_ready`` is set. On failure the exception is logged with its
    traceback and re-raised; ``_rg_ready`` stays unset, so the graph keeps
    being reported as not ready.

    Raises:
        Exception: whatever ``RetrieverGraph(...)`` or ``initialize()`` raises.
    """
    global _rg
    try:
        graph = RetrieverGraph(
            llm_model=llm,
            embeddings_config=embeddings_config,
            reranking_model_name=hf_reranker,
            use_cuda=use_cuda,
            inbuilt_tool_calling=True,
            fast_mode=fast_mode,
            debug=debug,
            enable_mcp=enable_mcp,
        )
        graph.initialize()
    except Exception:
        # This is the top of the worker thread: without an explicit log entry
        # a failed build would leave the service "initializing" forever with
        # nothing in the application log explaining why.
        logging.getLogger(__name__).exception(
            "RetrieverGraph initialization failed; graph will not become ready"
        )
        raise
    # Publish the instance before flipping the flag so that a reader who sees
    # the flag set also sees the graph.
    _rg = graph
    _rg_ready.set()
234+
235+
236+
def start_graph_init() -> None:
    """Launch graph initialization on a daemon thread.

    Calls made after the ``_rg_started`` flag is set return without spawning
    another thread.
    """
    if not _rg_started.is_set():
        _rg_started.set()
        worker = threading.Thread(target=_initialize_graph, daemon=True)
        worker.start()
242+
243+
244+
def reset_graph_state_for_testing() -> None:
    """Return the module to its pre-startup state for tests.

    Drops the cached graph and clears both the "started" and "ready" flags.
    """
    global _rg
    _rg = None
    for flag in (_rg_started, _rg_ready):
        flag.clear()
250+
251+
252+
@router.get("/ready")
async def ready() -> dict[str, str]:
    """Readiness probe: report whether the graph has finished initializing.

    Returns ``{"status": "ready"}`` once ``_rg_ready`` is set, otherwise
    ``{"status": "initializing"}``.
    """
    status = "ready" if _rg_ready.is_set() else "initializing"
    return {"status": status}
218258

219259

220260
chat_history: dict[UUID, list[dict[str, str]]] = {}
@@ -283,10 +323,11 @@ async def get_agent_response(
283323
"chat_history": get_history_str(db, conversation_uuid),
284324
}
285325

286-
if rg.graph is not None:
287-
output = list(rg.graph.stream(inputs, stream_mode="updates"))
326+
graph = get_graph()
327+
if graph is not None and graph.graph is not None:
328+
output = list(graph.graph.stream(inputs, stream_mode="updates"))
288329
else:
289-
raise ValueError("RetrieverGraph not initialized.")
330+
raise HTTPException(status_code=503, detail="Graph is still initializing. Please retry shortly.")
290331

291332
llm_response, context_sources, tools = parse_agent_output(output)
292333

@@ -382,8 +423,9 @@ async def get_response_stream(user_input: UserInput, db: Session | None) -> Any:
382423
current_llm_call_count = 1
383424
chunks: list[str] = []
384425

385-
if rg.graph is not None:
386-
async for event in rg.graph.astream_events(inputs, version="v2"):
426+
graph = get_graph()
427+
if graph is not None and graph.graph is not None:
428+
async for event in graph.graph.astream_events(inputs, version="v2"):
387429
chunk = event["event"]
388430

389431
if chunk == "on_chat_model_end":
@@ -406,6 +448,9 @@ async def get_response_stream(user_input: UserInput, db: Session | None) -> Any:
406448
if msg:
407449
chunks.append(str(msg))
408450
yield str(msg) + "\n\n"
451+
else:
452+
yield "Error: Graph is still initializing. Please retry shortly.\n\n"
453+
return
409454

410455
urls = list(set(urls))
411456
yield f"Sources: {', '.join(urls)}\n\n"

backend/tests/test_api_conversations_streaming.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,13 @@ def mock_retriever_graph():
3737
"""Mock RetrieverGraph for streaming tests."""
3838
# Reset the mock for each test
3939
mock_graph_global.reset_mock()
40+
# Set up the lazy-loaded graph so get_graph() returns the mock
41+
conversations._rg = mock_rg_instance
42+
conversations._rg_started.set()
43+
conversations._rg_ready.set()
4044
yield mock_graph_global
45+
# Teardown: reset graph state for next test
46+
conversations.reset_graph_state_for_testing()
4147

4248

4349
@pytest.fixture
@@ -436,16 +442,15 @@ async def test_get_response_stream_graph_not_initialized(
436442
"""Test behavior when graph is not initialized."""
437443
from src.api.routers.conversations import get_response_stream
438444

439-
with patch("src.api.routers.conversations.rg") as mock_rg:
440-
mock_rg.graph = None
445+
# Reset to simulate a fresh process where the graph hasn't been started yet
446+
conversations.reset_graph_state_for_testing()
441447

442-
chunks = []
443-
async for chunk in get_response_stream(sample_user_input, db_session):
444-
chunks.append(chunk)
448+
chunks = []
449+
async for chunk in get_response_stream(sample_user_input, db_session):
450+
chunks.append(chunk)
445451

446-
# When graph is None, streaming continues but produces no content chunks
447-
# Should still have sources line
448-
assert any("Sources:" in c for c in chunks)
452+
# When graph is None, the stream yields an error and returns early
453+
assert any("still initializing" in c for c in chunks)
449454

450455
@pytest.mark.asyncio
451456
async def test_get_response_stream_empty_content(

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ services:
4545
interval: ${HEALTHCHECK_INTERVAL:-30s}
4646
timeout: ${HEALTHCHECK_TIMEOUT:-10s}
4747
retries: ${HEALTHCHECK_RETRIES:-5}
48-
start_period: ${HEALTHCHECK_START_PERIOD:-1200s}
48+
start_period: ${HEALTHCHECK_START_PERIOD:-30s}
4949

5050
frontend:
5151
build:

0 commit comments

Comments
 (0)