|
4 | 4 | from helpers.modules import load_classes_from_file, load_classes_from_folder # keep here for backwards compatibility |
5 | 5 | from typing import Any |
6 | 6 |
|
| 7 | + |
| 8 | +def _sanitize_for_json_parsing(content: str) -> str: |
| 9 | + """ |
| 10 | + Remove common LLM output artifacts that break JSON parsing: |
| 11 | + markdown fences, XML-like wrappers, text outside JSON, and leading/trailing whitespace. |
| 12 | + """ |
| 13 | + if not isinstance(content, str): |
| 14 | + return "" |
| 15 | + |
| 16 | + # 1. Strip markdown code blocks (```json ... ``` and ``` ... ```) |
| 17 | + cleaned = re.sub(r'^```[a-zA-Z]*\s*\n?', '', content, flags=re.MULTILINE) |
| 18 | + cleaned = re.sub(r'\n?```\s*$', '', cleaned, flags=re.MULTILINE) |
| 19 | + |
| 20 | + # 2. Strip XML-style wrappers (<invoke>...</invoke>) |
| 21 | + cleaned = re.sub(r'<\s*invoke\b[^>]*>.*?</\s*invoke\s*>', '', cleaned, flags=re.DOTALL) |
| 22 | + cleaned = re.sub(r'<\s*/\s*invoke\s*>', '', cleaned, flags=re.DOTALL) |
| 23 | + |
| 24 | + # 3. Remove function-style wrappers (functions.tool_name:123) |
| 25 | + cleaned = re.sub(r'functions\.\w+\s*:\s*\w+\s*', '', cleaned) |
| 26 | + |
| 27 | + # 4. Remove leading text before the first '{' if any |
| 28 | + first_brace = cleaned.find('{') |
| 29 | + if first_brace > 0: |
| 30 | + cleaned = cleaned[first_brace:] |
| 31 | + |
| 32 | + return cleaned |
| 33 | + |
| 34 | + |
7 | 35 | def json_parse_dirty(json: str) -> dict[str, Any] | None: |
8 | 36 | if not json or not isinstance(json, str): |
9 | 37 | return None |
10 | 38 |
|
11 | | - ext_json = extract_json_object_string(json.strip()) |
| 39 | + sanitized = _sanitize_for_json_parsing(json) |
| 40 | + ext_json = extract_json_object_string(sanitized.strip()) |
12 | 41 | if ext_json: |
13 | 42 | try: |
14 | 43 | data = DirtyJson.parse_string(ext_json) |
|
0 commit comments