Fix and improve integration tests (#1996)

dagardner-nv · web-flow · commit 59a4fde932d6 · 2026-05-29T23:39:57.000Z
* Fix Docker builds for the local sandbox * Disable Raga AI e2e test as https://catalyst.raga.ai is currently down * Rename `test_tool_wrapper.py` to `test_lc_tool_wrapper.py` to avoid a pytest name clash * Update agent tests to accept a list of acceptable answers (e.g., allowing both "LLM" and "large language model") * Workflow improvements: - Replacing older models with a Nemotron model - Increasing max tokens and disabling thinking to prevent truncation ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/NeMo-Agent-Toolkit/blob/develop/docs/source/resources/contributing/index.md). - We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license. - Any contribution which contains commits that are not Signed-Off will not be accepted. - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes.
## Summary by CodeRabbit * **New Features** * N/A * **Bug Fixes** * Skipped flaky external-service test to avoid CI failures * **Chores** * Updated example configs and notebook to use a different recommended model, increase max tokens to 1024, and disable “thinking” * Adjusted CI image tag and JSON model counts * Made Docker upgrades non-interactive * **Tests** * Enhanced test utilities and fixtures to accept and validate multiple acceptable answers [![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/NeMo-Agent-Toolkit/pull/1996?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack) Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Yuchen Zhang (https://github.com/yczhang-nv) - Will Killian (https://github.com/willkill07) URL: #1996
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -134,7 +134,7 @@ test:python_tests:
       alias: phoenix
       pull_policy: if-not-present
 
-    - name: $CI_REGISTRY_IMAGE/nginx-rewrite-models:20260303
+    - name: $CI_REGISTRY_IMAGE/nginx-rewrite-models:20260529
       alias: nginx-rewrite-models
       pull_policy: if-not-present
 
diff --git a/ci/.nim_models_used.json b/ci/.nim_models_used.json
@@ -3,11 +3,11 @@
   "llms": [
     {
       "model": "meta/llama-3.1-70b-instruct",
-      "num_configs": 53
+      "num_configs": 51
     },
     {
       "model": "nvidia/nemotron-3-nano-30b-a3b",
-      "num_configs": 40
+      "num_configs": 42
     },
     {
       "model": "meta/llama-3.3-70b-instruct",
diff --git a/examples/agents/react/configs/config-reasoning.yml b/examples/agents/react/configs/config-reasoning.yml
@@ -17,9 +17,12 @@
 llms:
   nim_llm:
     _type: nim
-    model_name: meta/llama-3.1-70b-instruct
+    model_name: nvidia/nemotron-3-nano-30b-a3b
     temperature: 0.0
-    max_tokens: 250
+    max_tokens: 1024
+    chat_template_kwargs:
+      enable_thinking: false
+
   nemotron_model:
     _type: nim
     model_name: nvidia/llama-3.3-nemotron-super-49b-v1
diff --git a/examples/agents/react/configs/config.yml b/examples/agents/react/configs/config.yml
@@ -19,7 +19,9 @@ llms:
     _type: nim
     model_name: nvidia/nemotron-3-nano-30b-a3b
     temperature: 0.0
-    max_tokens: 250
+    max_tokens: 1024
+    chat_template_kwargs:
+      enable_thinking: false
 
 functions:
   wikipedia_search:
diff --git a/examples/agents/tests/conftest.py b/examples/agents/tests/conftest.py
@@ -43,9 +43,9 @@ def fixture_question() -> str:
     return "What are LLMs"
 
 
-@pytest.fixture(name="answer", scope="session")
-def fixture_answer() -> str:
-    return "large language model"
+@pytest.fixture(name="expected_answers", scope="session")
+def fixture_expected_answers() -> list[str]:
+    return ["llm", "large language model"]
 
 
 @pytest.fixture(name="rewoo_data", scope="session")
diff --git a/examples/agents/tests/test_agents.py b/examples/agents/tests/test_agents.py
@@ -35,10 +35,14 @@ def _extract_serve_response_text(response_json: dict) -> str:
     return "\n".join(combined)
 
 
-def _assert_expected_answer(result: str, expected_answer: str) -> None:
+def _assert_expected_answer(result: str, expected_answers: str | list[str]) -> None:
     """Assert that the expected answer appears in the result, normalizing whitespace and case."""
-    normalized = ' '.join(result.split())
-    assert expected_answer.lower() in normalized.lower(), f"Expected '{expected_answer}' in '{result}'"
+    normalized = ' '.join(result.split()).lower()
+    if isinstance(expected_answers, str):
+        expected_answers = [expected_answers]
+
+    assert any(expected.lower() in normalized for expected in expected_answers), \
+        f"Expected one of '{expected_answers}' in '{result}'"
 
 
 # ---------------------------------------------------------------------------
@@ -82,19 +86,21 @@ async def test_question(self, rewoo_nat_client, rewoo_data: list[dict], qa_idx:
 
 @pytest.mark.integration
 @pytest.mark.usefixtures("openai_api_key")
-async def test_tool_calling_responses_api(agents_dir: Path, question: str, answer: str):
+async def test_tool_calling_responses_api(agents_dir: Path, question: str, expected_answers: list[str]):
     await run_workflow(config_file=agents_dir / "tool_calling/configs/config-responses-api.yml",
                        question=question,
-                       expected_answer=answer)
+                       expected_answer=expected_answers)
 
 
 @pytest.mark.integration
 @pytest.mark.usefixtures("openai_api_key")
-async def test_nat_run_tool_calling_responses_api(tool_calling_responses_api_nat_client, question: str, answer: str):
+async def test_nat_run_tool_calling_responses_api(tool_calling_responses_api_nat_client,
+                                                  question: str,
+                                                  expected_answers: list[str]):
     resp = await tool_calling_responses_api_nat_client.post("/generate", json={"input_message": question})
     resp.raise_for_status()
     response_text = _extract_serve_response_text(resp.json())
-    _assert_expected_answer(response_text, answer)
+    _assert_expected_answer(response_text, expected_answers)
 
 
 # ---------------------------------------------------------------------------
@@ -107,23 +113,23 @@ async def test_nat_run_tool_calling_responses_api(tool_calling_responses_api_nat
 @pytest.mark.usefixtures("nvidia_api_key")
 class TestAgentNatRun:
 
-    async def test_question(self, agent_session_manager, question: str, answer: str):
+    async def test_question(self, agent_session_manager, question: str, expected_answers: list[str]):
         async with agent_session_manager.session() as session:
             async with session.run(question) as runner:
                 result = await runner.result(to_type=str)
-                _assert_expected_answer(result, answer)
+                _assert_expected_answer(result, expected_answers)
 
 
 @pytest.mark.slow
 @pytest.mark.integration
 @pytest.mark.usefixtures("nvidia_api_key")
 class TestAgentNatServe:
 
-    async def test_question(self, agent_nat_client, question: str, answer: str):
+    async def test_question(self, agent_nat_client, question: str, expected_answers: list[str]):
         resp = await agent_nat_client.post("/generate", json={"messages": [{"role": "user", "content": question}]})
         resp.raise_for_status()
         response_text = _extract_serve_response_text(resp.json())
-        _assert_expected_answer(response_text, answer)
+        _assert_expected_answer(response_text, expected_answers)
 
 
 # Code examples from `docs/source/resources/running-tests.md`
diff --git a/examples/agents/tool_calling/configs/config-reasoning.yml b/examples/agents/tool_calling/configs/config-reasoning.yml
@@ -19,7 +19,10 @@ llms:
     _type: nim
     model_name: nvidia/nemotron-3-nano-30b-a3b
     temperature: 0.0
-    max_tokens: 250
+    max_tokens: 1024
+    chat_template_kwargs:
+      enable_thinking: false
+
   reasoning_model:
     _type: nim
     model_name: nvidia/llama-3.3-nemotron-super-49b-v1
diff --git a/examples/agents/tool_calling/configs/config.yml b/examples/agents/tool_calling/configs/config.yml
@@ -17,9 +17,11 @@
 llms:
   nim_llm:
     _type: nim
-    model_name: meta/llama-3.1-70b-instruct
+    model_name: nvidia/nemotron-3-nano-30b-a3b
     temperature: 0.0
-    max_tokens: 250
+    max_tokens: 1024
+    chat_template_kwargs:
+      enable_thinking: false
 
 functions:
   wikipedia_search:
diff --git a/examples/notebooks/hello_world.ipynb b/examples/notebooks/hello_world.ipynb
@@ -179,7 +179,7 @@
     "   # Tell NeMo Agent Toolkit which LLM to use for the agent\n",
     "   nim_llm:\n",
     "      _type: nim\n",
-    "      model_name: meta/llama-3.1-70b-instruct\n",
+    "      model_name: nvidia/nemotron-3-nano-30b-a3b\n",
     "      temperature: 0.0\n",
     "\n",
     "workflow:\n",
diff --git a/examples/observability/simple_calculator_observability/tests/test_simple_calc_observability.py b/examples/observability/simple_calculator_observability/tests/test_simple_calc_observability.py
@@ -242,6 +242,7 @@ async def test_galileo_full_workflow(config_dir: Path,
     assert len(spans.records) > 1
 
 
+@pytest.mark.skip(reason="https://catalyst.raga.ai appears to be having issues")
 @pytest.mark.integration
 @pytest.mark.usefixtures("catalyst_keys", "aiq_compatibility_span_prefix")
 async def test_catalyst_full_workflow(config_dir: Path,
diff --git a/packages/nvidia_nat_core/src/nat/tool/code_execution/local_sandbox/Dockerfile.sandbox b/packages/nvidia_nat_core/src/nat/tool/code_execution/local_sandbox/Dockerfile.sandbox
@@ -22,7 +22,7 @@ FROM python:3.13-slim-bookworm
 
 
 RUN apt update && \
-    apt upgrade && \
+    apt upgrade -y && \
     apt install -y --no-install-recommends libexpat1 && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/packages/nvidia_nat_langchain/tests/test_lc_tool_wrapper.py b/packages/nvidia_nat_langchain/tests/test_lc_tool_wrapper.py
diff --git a/packages/nvidia_nat_test/src/nat/test/utils.py b/packages/nvidia_nat_test/src/nat/test/utils.py
@@ -74,7 +74,7 @@ async def run_workflow(*,
                        config: "Config | None" = None,
                        config_file: "StrPath | None" = None,
                        question: str,
-                       expected_answer: str | None = None,
+                       expected_answer: list[str] | str | None = None,
                        assert_expected_answer: bool = True,
                        **kwargs) -> str:
     """
@@ -86,17 +86,21 @@ async def run_workflow(*,
     result = await nat_run_workflow(config=config, config_file=config_file, prompt=question, to_type=str, **kwargs)
 
     if expected_answer is not None and assert_expected_answer:
+        if isinstance(expected_answer, str):
+            expected_answer = [expected_answer]
+
         # sometimes LLMs use fancy unicode space characters like \u202f, normalize before comparing
-        normalized_result = ' '.join(result.split())
-        assert expected_answer.lower() in normalized_result.lower(), f"Expected '{expected_answer}' in '{result}'"
+        normalized_result = ' '.join(result.split()).lower()
+        assert any(expected.lower() in normalized_result for expected in expected_answer), \
+            f"Expected one of '{expected_answer}' in '{result}'"
 
     return result
 
 
 async def serve_workflow(*,
                          config_path: Path,
                          question: str,
-                         expected_answer: str | None = None,
+                         expected_answer: list[str] | str | None = None,
                          assert_expected_answer: bool = True,
                          port: int = 8000,
                          pipeline_timeout: int = 60,
@@ -140,7 +144,11 @@ async def serve_workflow(*,
             response_text = "\n".join(combined_response)
 
         if expected_answer is not None and assert_expected_answer:
-            assert expected_answer.lower() in response_text.lower(), \
+            if isinstance(expected_answer, str):
+                expected_answer = [expected_answer]
+
+            normalized_result = ' '.join(response_text.split()).lower()
+            assert any(expected.lower() in normalized_result for expected in expected_answer), \
                 f"Unexpected response: {response.text}"
     finally:
         # Teardown

Original file line number	Diff line number	Diff line change
`@@ -3,11 +3,11 @@`
`3`	`3`	`"llms": [`
`4`	`4`	`{`
`5`	`5`	`"model": "meta/llama-3.1-70b-instruct",`
`6`		`- "num_configs": 53`
	`6`	`+ "num_configs": 51`
`7`	`7`	`},`
`8`	`8`	`{`
`9`	`9`	`"model": "nvidia/nemotron-3-nano-30b-a3b",`
`10`		`- "num_configs": 40`
	`10`	`+ "num_configs": 42`
`11`	`11`	`},`
`12`	`12`	`{`
`13`	`13`	`"model": "meta/llama-3.3-70b-instruct",`