21 | 21 | _log_completion, |
22 | 22 | _store_reasoning_traces, |
23 | 23 | _store_tool_calls, |
| 24 | + _stream_llm_call, |
24 | 25 | _update_token_stats_from_chunk, |
25 | 26 | llm_call, |
26 | 27 | ) |
27 | | -from nemoguardrails.context import llm_call_info_var, llm_stats_var, reasoning_trace_var, tool_calls_var |
| 28 | +from nemoguardrails.context import ( |
| 29 | + llm_call_info_var, |
| 30 | + llm_response_metadata_var, |
| 31 | + llm_stats_var, |
| 32 | + reasoning_trace_var, |
| 33 | + tool_calls_var, |
| 34 | +) |
28 | 35 | from nemoguardrails.exceptions import LLMCallException |
29 | 36 | from nemoguardrails.integrations.langchain.llm_adapter import ( |
30 | 37 | LangChainLLMAdapter, |
31 | 38 | _infer_provider_from_module, |
32 | 39 | ) |
33 | 40 | from nemoguardrails.logging.explain import LLMCallInfo |
34 | 41 | from nemoguardrails.logging.stats import LLMStats |
| 42 | +from nemoguardrails.streaming import StreamingHandler |
35 | 43 | from nemoguardrails.types import ChatMessage, LLMResponse, LLMResponseChunk, Role, ToolCall, ToolCallFunction, UsageInfo |
36 | 44 |
37 | 45 |
@@ -482,3 +490,150 @@ def provider_url(self): |
482 | 490 | await llm_call(model, []) |
483 | 491 |
484 | 492 | assert received_prompt == [] |
| 493 | + |
| 494 | + |
| 495 | +def _make_chunk_model(chunks): |
| 496 | + class _Model: |
| 497 | + model_name = "test-model" |
| 498 | + provider_name = "test" |
| 499 | + provider_url = None |
| 500 | + |
| 501 | + async def generate_async(self, prompt, *, stop=None, **kwargs): |
| 502 | + return LLMResponse(content="") |
| 503 | + |
| 504 | + async def stream_async(self, prompt, *, stop=None, **kwargs): |
| 505 | + for c in chunks: |
| 506 | + yield c |
| 507 | + |
| 508 | + return _Model() |
| 509 | + |
| 510 | + |
| 511 | +class TestStreamLlmCallAccumulation: |
| 512 | + @pytest.mark.asyncio |
| 513 | + async def test_accumulates_tool_calls(self): |
| 514 | + tc = [ToolCall(id="call_1", function=ToolCallFunction(name="get_weather", arguments={"city": "Paris"}))] |
| 515 | + model = _make_chunk_model( |
| 516 | + [ |
| 517 | + LLMResponseChunk(model="gpt-4o"), |
| 518 | + LLMResponseChunk(delta_tool_calls=tc, finish_reason="tool_calls"), |
| 519 | + LLMResponseChunk(usage=UsageInfo(input_tokens=10, output_tokens=5, total_tokens=15)), |
| 520 | + ] |
| 521 | + ) |
| 522 | + |
| 523 | + result = await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 524 | + |
| 525 | + assert result.tool_calls == tc |
| 526 | + assert result.model == "gpt-4o" |
| 527 | + assert result.finish_reason == "tool_calls" |
| 528 | + assert result.usage.total_tokens == 15 |
| 529 | + assert tool_calls_var.get() is not None |
| 530 | + |
| 531 | + @pytest.mark.asyncio |
| 532 | + async def test_accumulates_reasoning(self): |
| 533 | + model = _make_chunk_model( |
| 534 | + [ |
| 535 | + LLMResponseChunk(delta_reasoning="Let me ", model="gpt-4o"), |
| 536 | + LLMResponseChunk(delta_reasoning="think..."), |
| 537 | + LLMResponseChunk(delta_content="42", finish_reason="stop"), |
| 538 | + LLMResponseChunk(usage=UsageInfo(input_tokens=5, output_tokens=3, total_tokens=8)), |
| 539 | + ] |
| 540 | + ) |
| 541 | + |
| 542 | + result = await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 543 | + |
| 544 | + assert result.content == "42" |
| 545 | + assert result.reasoning == "Let me think..." |
| 546 | + assert result.model == "gpt-4o" |
| 547 | + assert result.finish_reason == "stop" |
| 548 | + assert reasoning_trace_var.get() == "Let me think..." |
| 549 | + |
| 550 | + @pytest.mark.asyncio |
| 551 | + async def test_text_only(self): |
| 552 | + model = _make_chunk_model( |
| 553 | + [ |
| 554 | + LLMResponseChunk(delta_content="Hello", model="gpt-4o"), |
| 555 | + LLMResponseChunk(delta_content=" world", finish_reason="stop"), |
| 556 | + LLMResponseChunk(usage=UsageInfo(input_tokens=5, output_tokens=2, total_tokens=7)), |
| 557 | + ] |
| 558 | + ) |
| 559 | + |
| 560 | + result = await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 561 | + |
| 562 | + assert result.content == "Hello world" |
| 563 | + assert result.tool_calls is None |
| 564 | + assert result.reasoning is None |
| 565 | + assert result.model == "gpt-4o" |
| 566 | + assert result.finish_reason == "stop" |
| 567 | + assert result.usage.total_tokens == 7 |
| 568 | + |
| 569 | + @pytest.mark.asyncio |
| 570 | + async def test_request_id_accumulated(self): |
| 571 | + model = _make_chunk_model( |
| 572 | + [ |
| 573 | + LLMResponseChunk(delta_content="hi", request_id="req-123", model="gpt-4o"), |
| 574 | + LLMResponseChunk(finish_reason="stop"), |
| 575 | + ] |
| 576 | + ) |
| 577 | + |
| 578 | + result = await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 579 | + |
| 580 | + assert result.request_id == "req-123" |
| 581 | + |
| 582 | + @pytest.mark.asyncio |
| 583 | + async def test_clears_tool_calls_var_when_none(self): |
| 584 | + tool_calls_var.set([{"id": "stale", "type": "function", "function": {"name": "old", "arguments": {}}}]) |
| 585 | + |
| 586 | + model = _make_chunk_model( |
| 587 | + [ |
| 588 | + LLMResponseChunk(delta_content="no tools here", finish_reason="stop"), |
| 589 | + ] |
| 590 | + ) |
| 591 | + |
| 592 | + await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 593 | + |
| 594 | + assert tool_calls_var.get() is None |
| 595 | + |
| 596 | + @pytest.mark.asyncio |
| 597 | + async def test_clears_reasoning_var_when_none(self): |
| 598 | + reasoning_trace_var.set("stale reasoning") |
| 599 | + |
| 600 | + model = _make_chunk_model( |
| 601 | + [ |
| 602 | + LLMResponseChunk(delta_content="no reasoning", finish_reason="stop"), |
| 603 | + ] |
| 604 | + ) |
| 605 | + |
| 606 | + await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 607 | + |
| 608 | + assert reasoning_trace_var.get() is None |
| 609 | + |
| 610 | + @pytest.mark.asyncio |
| 611 | + async def test_provider_metadata_stored_flat(self): |
| 612 | + model = _make_chunk_model( |
| 613 | + [ |
| 614 | + LLMResponseChunk( |
| 615 | + delta_content="hi", |
| 616 | + provider_metadata={"system_fingerprint": "fp_abc"}, |
| 617 | + finish_reason="stop", |
| 618 | + ), |
| 619 | + ] |
| 620 | + ) |
| 621 | + |
| 622 | + await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 623 | + |
| 624 | + metadata = llm_response_metadata_var.get() |
| 625 | + assert metadata == {"system_fingerprint": "fp_abc"} |
| 626 | + |
| 627 | + @pytest.mark.asyncio |
| 628 | + async def test_clears_metadata_var_when_none(self): |
| 629 | + llm_response_metadata_var.set({"stale": True}) |
| 630 | + |
| 631 | + model = _make_chunk_model( |
| 632 | + [ |
| 633 | + LLMResponseChunk(delta_content="no metadata", finish_reason="stop"), |
| 634 | + ] |
| 635 | + ) |
| 636 | + |
| 637 | + await _stream_llm_call(model, "test", StreamingHandler(), stop=None) |
| 638 | + |
| 639 | + assert llm_response_metadata_var.get() is None |
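
For orientation (not part of the diff): the new tests treat `_stream_llm_call(model, prompt, streaming_handler, stop=...)` as consuming the model's `stream_async` chunks, folding the deltas into a single `LLMResponse`, and publishing tool calls, reasoning, and provider metadata to the corresponding context vars, clearing any stale values when a category is absent. Below is a minimal illustrative fold that mirrors those assertions; it assumes `LLMResponseChunk`/`LLMResponse` expose these fields as optional attributes/kwargs, the helper name `_fold_chunks` is hypothetical, and the real function presumably also pipes text deltas through the `StreamingHandler`, which this sketch omits.

```python
# Illustrative sketch only -- not the nemoguardrails implementation.
from nemoguardrails.context import (
    llm_response_metadata_var,
    reasoning_trace_var,
    tool_calls_var,
)
from nemoguardrails.types import LLMResponse


async def _fold_chunks(chunk_iter):
    content_parts, reasoning_parts, tool_calls = [], [], None
    model = finish_reason = request_id = usage = metadata = None

    async for chunk in chunk_iter:
        # Text and reasoning arrive as deltas and are concatenated in order.
        if chunk.delta_content:
            content_parts.append(chunk.delta_content)
        if chunk.delta_reasoning:
            reasoning_parts.append(chunk.delta_reasoning)
        if chunk.delta_tool_calls:
            tool_calls = (tool_calls or []) + list(chunk.delta_tool_calls)
        # Scalar metadata keeps the last non-empty value seen on any chunk.
        model = chunk.model or model
        finish_reason = chunk.finish_reason or finish_reason
        request_id = chunk.request_id or request_id
        usage = chunk.usage or usage
        metadata = chunk.provider_metadata or metadata

    # Publish side channels, clearing stale values when nothing was streamed.
    tool_calls_var.set(tool_calls)
    reasoning_trace_var.set("".join(reasoning_parts) or None)
    llm_response_metadata_var.set(metadata)

    # Assumes LLMResponse accepts these optional fields as constructor kwargs.
    return LLMResponse(
        content="".join(content_parts),
        reasoning="".join(reasoning_parts) or None,
        tool_calls=tool_calls,
        model=model,
        finish_reason=finish_reason,
        request_id=request_id,
        usage=usage,
    )
```

Under this reading, awaiting `_fold_chunks(model.stream_async("test"))` against the chunk sequences built by `_make_chunk_model` would reproduce the assertions in `TestStreamLlmCallAccumulation` for the text-only, reasoning, and tool-call cases.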