fix(json_parser): tolerate dict/list input in validate_and_repair_json

offendingcommit · offendingcommit · commit 1eca869e5131 · 2026-05-06T13:48:04.000-05:00
Some providers return chat.completions message.content as an already-parsed
object rather than a JSON-encoded string — observed with Cloudflare Workers
AI on the /compat route for @cf/meta/llama-4-scout-17b-16e-instruct under
response_format=json_schema. The OpenAI Python SDK then raises a Pydantic
ValidationError ("JSON input should be string, bytes or bytearray"), and
honcho's repair fallback _parse_or_repair_structured_content routes the
content to validate_and_repair_json, which crashed at json_str.strip() with
AttributeError: "'dict' object has no attribute 'strip'".

Encode non-string input via json.dumps() at the function boundary so the
rest of the repair pipeline can keep assuming string input. Loosen the
parameter annotation from str to Any to reflect the new contract; the
return type is still str.

Adds tests/utils/test_json_parser.py with regression coverage for dict,
list, and nested-dict inputs, plus plain and whitespace-padded string
inputs to confirm the existing string behavior is unchanged.
diff --git a/src/utils/json_parser.py b/src/utils/json_parser.py
@@ -352,8 +352,18 @@ def simple_bracket_repair(json_str: str) -> str:
     return repaired
 
 
-def validate_and_repair_json(json_str: str) -> str:
-    """Main function with comprehensive repair strategies"""
+def validate_and_repair_json(json_str: Any) -> str:
+    """Main function with comprehensive repair strategies.
+
+    Accepts a string (the normal case) or a dict/list, since some providers
+    (e.g. Cloudflare Workers AI on the /compat route for llama-4-scout)
+    return chat.completions content as an already-parsed object instead of
+    a JSON-encoded string. JSON-encode non-string input first so the rest
+    of the repair pipeline (which assumes string operations like .strip())
+    keeps working.
+    """
+    if not isinstance(json_str, str):
+        json_str = json.dumps(json_str)
     json_str = json_str.strip()
 
     # Try parsing with repair library
diff --git a/tests/utils/test_json_parser.py b/tests/utils/test_json_parser.py
@@ -0,0 +1,49 @@
+"""Tests for src/utils/json_parser.py."""
+
+import json
+
+import pytest
+
+from src.utils.json_parser import validate_and_repair_json
+
+
+class TestValidateAndRepairJsonInputTypes:
+    """Regression tests for non-string inputs.
+
+    Some providers (Cloudflare Workers AI /compat for llama-4-scout)
+    return chat.completions content as an already-parsed dict instead
+    of a JSON-encoded string. The repair pipeline must JSON-encode the
+    input first instead of crashing on .strip().
+    """
+
+    def test_dict_input_produces_same_result_as_equivalent_string(self):
+        payload = {"explicit": [{"content": "user_alice exists"}]}
+        result_from_dict = validate_and_repair_json(payload)
+        result_from_str = validate_and_repair_json(json.dumps(payload))
+        assert json.loads(result_from_dict) == json.loads(result_from_str)
+
+    def test_list_input_does_not_crash(self):
+        payload = [{"fact": "a"}, {"fact": "b"}]
+        result = validate_and_repair_json(payload)
+        assert json.loads(result) == payload
+
+    def test_nested_dict_input(self):
+        payload = {
+            "explicit": [
+                {"content": "fact one"},
+                {"content": "fact two"},
+            ],
+            "implicit": [],
+        }
+        result = validate_and_repair_json(payload)
+        assert json.loads(result) == payload
+
+    def test_string_input_still_works(self):
+        payload = '{"key": "value"}'
+        result = validate_and_repair_json(payload)
+        assert json.loads(result) == {"key": "value"}
+
+    def test_string_input_with_whitespace_still_stripped(self):
+        payload = '   {"key": "value"}   \n'
+        result = validate_and_repair_json(payload)
+        assert json.loads(result) == {"key": "value"}