fix(llm): avoid Groq schema-tool failures during guideline generation (#264)

gaodan-fang · web-flow · commit 5b7bc7cf10e7 · 2026-06-01T23:44:08.000-04:00
LiteLLM can report response-schema support for Groq gpt-oss even though the request still fails when the model returns plain text instead of the required tool call. Guideline generation and guideline consolidation now keep Groq on the existing JSON prompt-and-parse path, while structured outputs stay enabled for providers that handle them reliably.

Constraint: Groq openai/gpt-oss-120b may reject schema/tool-backed responses when no tool call is produced
Constraint: Upstream renamed tip generation paths to guideline generation during rebase
Rejected: Add a Groq SDK client | LiteLLM already fronts all generation paths and the failure is limited to schema routing
Rejected: Disable structured outputs globally | OpenAI-compatible providers still benefit from strict schema mode
Confidence: high
Scope-risk: narrow
Directive: Do not re-enable response_format for Groq guideline generation or consolidation without first live-testing groq/openai/gpt-oss-120b tool/schema behavior
Tested: uv run pytest tests/unit/test_guidelines.py tests/unit/test_combine_guidelines.py -v
Tested: uv run ruff check altk_evolve/llm/guidelines/guidelines.py altk_evolve/llm/guidelines/clustering.py tests/unit/test_guidelines.py tests/unit/test_combine_guidelines.py
Tested: uv run ruff format --check altk_evolve/llm/guidelines/guidelines.py altk_evolve/llm/guidelines/clustering.py tests/unit/test_guidelines.py tests/unit/test_combine_guidelines.py
diff --git a/altk_evolve/llm/guidelines/clustering.py b/altk_evolve/llm/guidelines/clustering.py
@@ -141,6 +141,7 @@ def combine_cluster(entities: list[RecordedEntity]) -> list[Guideline]:
     Raises:
         EvolveException: If the LLM call fails after 3 attempts.
     """
+    is_groq = llm_settings.custom_llm_provider == "groq" or llm_settings.guidelines_model.startswith("groq/")
     supported_params = get_supported_openai_params(
         model=llm_settings.guidelines_model,
         custom_llm_provider=llm_settings.custom_llm_provider,
@@ -150,7 +151,7 @@ def combine_cluster(entities: list[RecordedEntity]) -> list[Guideline]:
         model=llm_settings.guidelines_model,
         custom_llm_provider=llm_settings.custom_llm_provider,
     )
-    constrained_decoding_supported = supports_response_format and response_schema_enabled
+    constrained_decoding_supported = not is_groq and supports_response_format and response_schema_enabled
 
     # Deduplicate task descriptions
     task_descriptions = list(
diff --git a/altk_evolve/llm/guidelines/guidelines.py b/altk_evolve/llm/guidelines/guidelines.py
@@ -171,6 +171,7 @@ def generate_guidelines(messages: list[dict]) -> list[GuidelineGenerationResult]
     Returns a list with one GuidelineGenerationResult per subtask (or one for the full
     trajectory when segmentation is disabled or produces fewer than 2 subtasks).
     """
+    is_groq = llm_settings.custom_llm_provider == "groq" or llm_settings.guidelines_model.startswith("groq/")
     supported_params = get_supported_openai_params(
         model=llm_settings.guidelines_model,
         custom_llm_provider=llm_settings.custom_llm_provider,
@@ -180,7 +181,7 @@ def generate_guidelines(messages: list[dict]) -> list[GuidelineGenerationResult]
         model=llm_settings.guidelines_model,
         custom_llm_provider=llm_settings.custom_llm_provider,
     )
-    constrained_decoding_supported = bool(supports_response_format and response_schema_enabled)
+    constrained_decoding_supported = bool(not is_groq and supports_response_format and response_schema_enabled)
 
     trajectory_data = parse_openai_agents_trajectory(messages)
     task_instruction = trajectory_data["task_instruction"]
diff --git a/tests/unit/test_combine_guidelines.py b/tests/unit/test_combine_guidelines.py
@@ -6,10 +6,11 @@
 
 import pytest
 
+from altk_evolve.llm.guidelines import clustering as clustering_module
 from altk_evolve.llm.guidelines.clustering import combine_cluster
 from altk_evolve.schema.core import RecordedEntity
 from altk_evolve.schema.exceptions import EvolveException
-from altk_evolve.schema.guidelines import Guideline, ConsolidationResult
+from altk_evolve.schema.guidelines import ConsolidationResult, Guideline
 
 
 def _make_entity(entity_id: str, content: str, task_description: str = "do a task") -> RecordedEntity:
@@ -110,7 +111,9 @@ def test_combine_cluster_raises_after_max_retries(self, _mock_params, _mock_sche
     @patch("altk_evolve.llm.guidelines.clustering.completion")
     @patch("altk_evolve.llm.guidelines.clustering.supports_response_schema", return_value=True)
     @patch("altk_evolve.llm.guidelines.clustering.get_supported_openai_params", return_value=["response_format"])
-    def test_combine_cluster_uses_structured_output(self, _mock_params, _mock_schema, mock_completion):
+    def test_combine_cluster_uses_structured_output(self, _mock_params, _mock_schema, mock_completion, monkeypatch):
+        monkeypatch.setattr(clustering_module.llm_settings, "guidelines_model", "gpt-4o")
+        monkeypatch.setattr(clustering_module.llm_settings, "custom_llm_provider", "openai")
         mock_completion.return_value = _mock_completion_response(SAMPLE_GUIDELINES[:1])
 
         entities = [_make_entity("1", "Guideline A"), _make_entity("2", "Guideline B")]
@@ -121,6 +124,29 @@ def test_combine_cluster_uses_structured_output(self, _mock_params, _mock_schema
         _, kwargs = mock_completion.call_args
         assert "response_format" in kwargs
 
+    @patch("altk_evolve.llm.guidelines.clustering.completion")
+    @patch("altk_evolve.llm.guidelines.clustering.supports_response_schema", return_value=True)
+    @patch("altk_evolve.llm.guidelines.clustering.get_supported_openai_params", return_value=["response_format"])
+    def test_combine_cluster_uses_json_prompt_for_groq_even_when_schema_is_reported(
+        self,
+        _mock_params,
+        _mock_schema,
+        mock_completion,
+        monkeypatch,
+    ):
+        monkeypatch.setattr(clustering_module.llm_settings, "guidelines_model", "groq/openai/gpt-oss-120b")
+        monkeypatch.setattr(clustering_module.llm_settings, "custom_llm_provider", "groq")
+        mock_completion.return_value = _mock_completion_response(SAMPLE_GUIDELINES[:1])
+
+        entities = [_make_entity("1", "Guideline A"), _make_entity("2", "Guideline B")]
+        result = combine_cluster(entities)
+
+        assert len(result) == 1
+        _, kwargs = mock_completion.call_args
+        assert "response_format" not in kwargs
+        assert kwargs["custom_llm_provider"] == "groq"
+        assert "Output Format (JSON)" in kwargs["messages"][0]["content"]
+
 
 # ---------------------------------------------------------------------------
 # consolidate_guidelines tests
diff --git a/tests/unit/test_guidelines.py b/tests/unit/test_guidelines.py
@@ -1,8 +1,19 @@
 """Tests for guideline generation utilities."""
 
+import json
+from unittest.mock import MagicMock, patch
+
 import pytest
 
-from altk_evolve.llm.guidelines.guidelines import parse_openai_agents_trajectory
+from altk_evolve.llm.guidelines import guidelines as guidelines_module
+from altk_evolve.llm.guidelines.guidelines import generate_guidelines, parse_openai_agents_trajectory
+
+
+def _mock_completion_response(payload: dict) -> MagicMock:
+    response = MagicMock()
+    response.choices = [MagicMock()]
+    response.choices[0].message.content = json.dumps(payload)
+    return response
 
 
 @pytest.mark.unit
@@ -23,3 +34,38 @@ def test_fallback_when_no_user_message(self):
     def test_fallback_when_empty_messages(self):
         result = parse_openai_agents_trajectory([])
         assert result["task_instruction"] == "Task description unknown"
+
+    @patch("altk_evolve.llm.guidelines.guidelines.completion")
+    @patch("altk_evolve.llm.guidelines.guidelines.supports_response_schema", return_value=True)
+    @patch("altk_evolve.llm.guidelines.guidelines.get_supported_openai_params", return_value=["response_format"])
+    def test_generate_guidelines_uses_json_prompt_for_groq_even_when_schema_is_reported(
+        self,
+        _mock_params,
+        _mock_schema,
+        mock_completion,
+        monkeypatch,
+    ):
+        monkeypatch.setattr(guidelines_module.llm_settings, "guidelines_model", "groq/openai/gpt-oss-120b")
+        monkeypatch.setattr(guidelines_module.llm_settings, "custom_llm_provider", "groq")
+        monkeypatch.setattr(guidelines_module.evolve_config, "segmentation_enabled", False)
+        mock_completion.return_value = _mock_completion_response(
+            {
+                "guidelines": [
+                    {
+                        "content": "Validate files before parsing",
+                        "rationale": "Avoids parser crashes on empty inputs",
+                        "category": "strategy",
+                        "trigger": "Before reading user-provided CSV files",
+                        "implementation_steps": ["Check file size", "Return an empty DataFrame for empty files"],
+                    }
+                ]
+            }
+        )
+
+        results = generate_guidelines([{"role": "user", "content": "Fix CSV parsing"}])
+
+        assert results[0].guidelines[0].content == "Validate files before parsing"
+        _, kwargs = mock_completion.call_args
+        assert "response_format" not in kwargs
+        assert kwargs["custom_llm_provider"] == "groq"
+        assert "Output Format (JSON)" in kwargs["messages"][0]["content"]