Skip to content

Commit 2520447

Browse files
aminghadersohi and michael-s-molina
authored and committed
fix(mcp): wire up compact schema serialization for search_tools results (#39229)
(cherry picked from commit e17cf3c)
1 parent f356541 commit 2520447

3 files changed

Lines changed: 472 additions & 2 deletions

File tree

superset/mcp_service/mcp_config.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -248,9 +248,19 @@
248248
# - "bm25": Natural language search using BM25 ranking (recommended)
249249
# - "regex": Pattern-based search using regular expressions
250250
#
251+
# Schema Compaction:
252+
# ------------------
253+
# When compact_schemas=True, search results strip $defs sections and replace
254+
# $ref pointers with {"type": "object"}, and truncate tool descriptions.
255+
# This reduces per-search token cost by ~40-60%. Full schemas remain
256+
# available when the tool is actually invoked via call_tool.
257+
#
251258
# Rollback:
252259
# ---------
253-
# Set enabled=False in superset_config.py for instant rollback.
260+
# - Set enabled=False to disable tool search entirely (full catalog exposed).
261+
# - Set compact_schemas=False to disable schema compaction only (full $defs
262+
# and descriptions in search results, tool search still active).
263+
# - Set max_description_length=0 to disable description truncation only.
254264
# =============================================================================
255265
MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = {
256266
"enabled": True, # Enabled by default — reduces initial context by ~70%
@@ -262,6 +272,8 @@
262272
],
263273
"search_tool_name": "search_tools", # Name of the search tool
264274
"call_tool_name": "call_tool", # Name of the call proxy tool
275+
"compact_schemas": True, # Strip $defs and simplify $ref in search results
276+
"max_description_length": 300, # Truncate tool descriptions (0 = no truncation)
265277
}
266278

267279

superset/mcp_service/server.py

Lines changed: 119 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,84 @@ def _strip_titles(obj: Any, in_properties_map: bool = False) -> Any:
175175
return obj
176176

177177

178+
def _simplify_optional_union(result: dict[str, Any]) -> dict[str, Any]:
179+
"""Collapse ``anyOf``/``oneOf`` with exactly one non-null variant.
180+
181+
Pydantic encodes ``Optional[X]`` as ``{"anyOf": [<X>, {"type": "null"}]}``.
182+
This replaces the union with the non-null variant while preserving any
183+
``description`` or ``default`` from the parent node.
184+
"""
185+
for union_key in ("anyOf", "oneOf"):
186+
variants = result.get(union_key)
187+
if not isinstance(variants, list) or len(variants) != 2:
188+
continue
189+
non_null = [v for v in variants if v.get("type") != "null"]
190+
if len(non_null) != 1:
191+
continue
192+
simplified = dict(non_null[0])
193+
for keep in ("description", "default"):
194+
if keep in result and keep not in simplified:
195+
simplified[keep] = result[keep]
196+
result.pop(union_key)
197+
result.pop("description", None)
198+
result.pop("default", None)
199+
result.update(simplified)
200+
return result
201+
202+
203+
def _compact_schema(obj: Any) -> Any:
204+
"""Collapse ``$defs`` and ``$ref`` pointers in a JSON Schema.
205+
206+
Search results only need enough schema detail for the LLM to identify
207+
which tool to call and construct a basic invocation. Full schemas
208+
(with all nested model definitions) are still available when the tool
209+
is actually invoked via ``call_tool``.
210+
211+
Transformations applied:
212+
213+
* ``$defs`` sections are removed entirely.
214+
* ``{"$ref": "..."}`` is replaced with ``{"type": "object"}``.
215+
* ``anyOf``/``oneOf`` lists containing only a ``$ref`` and
216+
``{"type": "null"}`` (Pydantic's Optional encoding) are collapsed
217+
to the simplified non-null variant.
218+
"""
219+
if isinstance(obj, list):
220+
return [_compact_schema(item) for item in obj]
221+
if not isinstance(obj, dict):
222+
return obj
223+
224+
# Direct $ref → generic object type
225+
if "$ref" in obj:
226+
replacement: dict[str, Any] = {"type": "object"}
227+
if desc := obj.get("description"):
228+
replacement["description"] = desc
229+
return replacement
230+
231+
result: dict[str, Any] = {}
232+
for key, value in obj.items():
233+
if key == "$defs":
234+
continue
235+
result[key] = _compact_schema(value)
236+
237+
return _simplify_optional_union(result)
238+
239+
240+
def _truncate_description(text: str, max_length: int) -> str:
241+
"""Truncate a tool description for search results.
242+
243+
Cuts at the last sentence boundary before *max_length*, or at
244+
*max_length* with an ellipsis if no sentence boundary is found.
245+
"""
246+
if not text or len(text) <= max_length:
247+
return text
248+
# Try to cut at the last sentence boundary
249+
truncated = text[:max_length]
250+
last_period = truncated.rfind(". ")
251+
if last_period > max_length // 2:
252+
return truncated[: last_period + 1]
253+
return truncated.rstrip() + "..."
254+
255+
178256
def _serialize_tools_without_output_schema(
179257
tools: Sequence[Any],
180258
) -> list[dict[str, Any]]:
@@ -194,6 +272,46 @@ def _serialize_tools_without_output_schema(
194272
return results
195273

196274

275+
def _create_search_result_serializer(
276+
config: dict[str, Any],
277+
) -> Any:
278+
"""Build a search-result serializer from the tool-search config.
279+
280+
When ``compact_schemas`` is enabled (default), the serializer applies
281+
additional compaction on top of the base serialization:
282+
283+
* ``$defs`` sections and ``$ref`` pointers are collapsed
284+
(see :func:`_compact_schema`).
285+
* Tool descriptions are truncated to ``max_description_length`` chars.
286+
287+
This reduces per-search-call token cost by ~40-60 % while keeping
288+
enough detail for the LLM to identify the right tool and construct
289+
a basic invocation.
290+
"""
291+
compact = config.get("compact_schemas", True)
292+
# Description truncation defaults to 300 when compact_schemas is on,
293+
# but is disabled when compact_schemas is off (unless explicitly set).
294+
max_desc_default = 300 if compact else 0
295+
max_desc = config.get("max_description_length", max_desc_default)
296+
297+
if not compact and not max_desc:
298+
return _serialize_tools_without_output_schema
299+
300+
def _serializer(tools: Sequence[Any]) -> list[dict[str, Any]]:
301+
results = _serialize_tools_without_output_schema(tools)
302+
for data in results:
303+
if compact:
304+
if input_schema := data.get("inputSchema"):
305+
data["inputSchema"] = _compact_schema(input_schema)
306+
if max_desc and "description" in data:
307+
data["description"] = _truncate_description(
308+
data["description"], max_desc
309+
)
310+
return results
311+
312+
return _serializer
313+
314+
197315
def _fix_call_tool_arguments(tool: Any) -> Any:
198316
"""Fix anyOf schema in call_tool ``arguments`` for MCP bridge compatibility.
199317
@@ -270,7 +388,7 @@ def _apply_tool_search_transform(mcp_instance: Any, config: dict[str, Any]) -> N
270388
"always_visible": config.get("always_visible", []),
271389
"search_tool_name": config.get("search_tool_name", "search_tools"),
272390
"call_tool_name": config.get("call_tool_name", "call_tool"),
273-
"search_result_serializer": _serialize_tools_without_output_schema,
391+
"search_result_serializer": _create_search_result_serializer(config),
274392
}
275393

276394
def _make_normalizing_call_tool(transform: Any) -> Tool:

0 commit comments

Comments
 (0)