Skip to content

Commit 4e36a23

Browse files
Sodawyx and claude committed
fix: sanitize non-identifier field names in MCP/OpenAPI tool schemas
MCP / OpenAPI 工具的 JSON Schema 经常包含带 `-` 的字段名 (如 `x-access-id`、`api-version`)、Python 保留字 (`class`、`from`) 或以数字开头的字段。Pydantic 接受这类字段名, 但下游 `inspect.Parameter` 会抛 ValueError, 导致整个工具加载失败、被静默丢弃。

本提交为 JSON Schema → Pydantic 的转换层加上字段名 sanitizer: 内部用合法 Python 标识符做 Pydantic 字段名 (`x_access_id`), 通过 `alias` 同时保留原名给 JSON Schema 输出和 MCP 调用使用。配合 `populate_by_name=True`, 两种写法都能验证通过, `model_dump(by_alias=True)` 确保实际下发到 MCP backend 的字段名仍是原始名 `x-access-id`。

同步给 `_create_function_with_signature` 的 alias 循环加上防御性 sanitize, 避免未来扩展 `__agentrun_argument_aliases__` 时再次踩坑。

新增 13 个回归测试覆盖: 含 `-` / `.` 的字段名、数字开头、保留字 (`class`)、空串、`_build_tool_from_meta` 端到端链路、alias 循环防御。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 1435314 commit 4e36a23

2 files changed

Lines changed: 225 additions & 8 deletions

File tree

agentrun/integration/utils/tool.py

Lines changed: 84 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from pydantic import (
4242
AliasChoices,
4343
BaseModel,
44+
ConfigDict,
4445
create_model,
4546
Field,
4647
ValidationError,
@@ -1396,6 +1397,7 @@ def _create_function_with_signature(
13961397
args_schema, "__agentrun_argument_aliases__", {}
13971398
)
13981399
if alias_map:
1400+
existing_param_names = {p.name for p in parameters}
13991401
for alias, canonical in alias_map.items():
14001402
canonical_field = args_schema.model_fields.get(canonical)
14011403
alias_annotation = (
@@ -1408,9 +1410,20 @@ def _create_function_with_signature(
14081410
and alias_annotation is not None
14091411
):
14101412
alias_annotation = Optional[alias_annotation]
1413+
# 防御性 sanitize: alias 同样要落到 inspect.Parameter 上, 非法字符
1414+
# (如 ``x-access-id``)会触发 ValueError。当前 alias 仅由
1415+
# ``_maybe_add_body_alias`` 写入 "query", 但未来可能扩展。
1416+
alias_name = (
1417+
alias
1418+
if alias.isidentifier()
1419+
else _sanitize_python_identifier(alias)
1420+
)
1421+
if alias_name in existing_param_names:
1422+
continue
1423+
existing_param_names.add(alias_name)
14111424
parameters.append(
14121425
inspect.Parameter(
1413-
alias,
1426+
alias_name,
14141427
inspect.Parameter.KEYWORD_ONLY,
14151428
default=None,
14161429
annotation=alias_annotation,
@@ -1425,7 +1438,9 @@ def impl(**kwargs):
14251438
if args_schema is not None:
14261439
try:
14271440
parsed = args_schema(**normalized_kwargs)
1428-
payload = parsed.model_dump(mode="python", exclude_unset=True)
1441+
payload = parsed.model_dump(
1442+
mode="python", exclude_unset=True, by_alias=True
1443+
)
14291444
except ValidationError as exc:
14301445
raise ValueError(
14311446
f"Invalid arguments for tool '{tool_name}': {exc}"
@@ -1674,6 +1689,33 @@ def _build_openapi_schema(
16741689
return schema, tuple(body_field_names), alias_map
16751690

16761691

1692+
_PY_KEYWORDS: Set[str] = set()
1693+
1694+
1695+
def _sanitize_python_identifier(name: str) -> str:
1696+
"""将任意字符串转换为合法的 Python 标识符
1697+
1698+
用于把 JSON Schema 中含 ``-`` / ``.`` 等字符的字段名(例如 ``x-access-id``)
1699+
映射成 Pydantic / ``inspect.Parameter`` 都能接受的字段名。原始名通过 alias
1700+
继续保留在 JSON Schema 和实际调用中。
1701+
"""
1702+
import keyword
1703+
1704+
if not _PY_KEYWORDS:
1705+
_PY_KEYWORDS.update(keyword.kwlist)
1706+
1707+
sanitized = re.sub(r"[^0-9a-zA-Z_]", "_", name)
1708+
sanitized = sanitized.lstrip("_")
1709+
if not sanitized:
1710+
sanitized = "field"
1711+
if sanitized[0].isdigit():
1712+
# Pydantic 不允许字段名以下划线开头, 因此用字母前缀.
1713+
sanitized = "field_" + sanitized
1714+
if sanitized in _PY_KEYWORDS:
1715+
sanitized = sanitized + "_"
1716+
return sanitized
1717+
1718+
16771719
def _json_schema_to_pydantic(
16781720
name: str,
16791721
schema: Optional[Dict[str, Any]],
@@ -1688,40 +1730,74 @@ def _json_schema_to_pydantic(
16881730

16891731
required_fields = set(schema.get("required", []))
16901732
fields = {}
1733+
needs_populate_by_name = False
1734+
used_py_names: Set[str] = set()
16911735

16921736
for field_name, field_schema in properties.items():
16931737
if not isinstance(field_schema, dict):
16941738
continue
16951739

1740+
# 把含非法字符(如 ``x-access-id``)或保留字(``class``)的字段名映射到
1741+
# 合法的 Python 标识符, 通过 alias 保留原名以便 JSON Schema 输出和
1742+
# 调用真实 MCP 工具时使用。
1743+
import keyword as _kw
1744+
1745+
if field_name.isidentifier() and not _kw.iskeyword(field_name):
1746+
py_name = field_name
1747+
else:
1748+
py_name = _sanitize_python_identifier(field_name)
1749+
if py_name in used_py_names:
1750+
suffix = 2
1751+
while f"{py_name}_{suffix}" in used_py_names:
1752+
suffix += 1
1753+
py_name = f"{py_name}_{suffix}"
1754+
used_py_names.add(py_name)
1755+
if py_name != field_name:
1756+
needs_populate_by_name = True
1757+
16961758
# 映射类型
16971759
field_type = _json_type_to_python(field_schema)
16981760
description = field_schema.get("description", "")
16991761
default = field_schema.get("default")
17001762
aliases = field_schema.get("x-aliases")
17011763
field_kwargs: Dict[str, Any] = {"description": description}
1764+
1765+
# 用 ``alias`` 同时作用于 JSON Schema 输出和 by_alias dump,
1766+
# 让 LLM/调用端看到的字段名仍是原始名(如 ``x-access-id``)。
1767+
if py_name != field_name:
1768+
field_kwargs["alias"] = field_name
17021769
if aliases:
17031770
if not isinstance(aliases, (list, tuple)):
17041771
aliases = [aliases]
1705-
field_kwargs["validation_alias"] = AliasChoices(
1706-
field_name, *aliases
1707-
)
1772+
alias_choices: List[str] = [field_name]
1773+
if py_name != field_name:
1774+
alias_choices.append(py_name)
1775+
for alias in aliases:
1776+
if alias and alias not in alias_choices:
1777+
alias_choices.append(alias)
1778+
field_kwargs["validation_alias"] = AliasChoices(*alias_choices)
17081779

17091780
# 构建字段定义
17101781
if field_name in required_fields:
17111782
# 必填字段
1712-
fields[field_name] = (field_type, Field(**field_kwargs))
1783+
fields[py_name] = (field_type, Field(**field_kwargs))
17131784
else:
17141785
# 可选字段
17151786
from typing import Optional as TypingOptional
17161787

1717-
fields[field_name] = (
1788+
fields[py_name] = (
17181789
TypingOptional[field_type],
17191790
Field(default=default, **field_kwargs),
17201791
)
17211792

17221793
# 创建模型,清理名称
17231794
model_name = re.sub(r"[^0-9a-zA-Z]", "", name.title())
1724-
return create_model(model_name or "Args", **fields) # type: ignore
1795+
model_kwargs: Dict[str, Any] = {}
1796+
if needs_populate_by_name:
1797+
model_kwargs["__config__"] = ConfigDict(populate_by_name=True)
1798+
return create_model( # type: ignore
1799+
model_name or "Args", **model_kwargs, **fields
1800+
)
17251801

17261802

17271803
def _json_type_to_python(field_schema: Dict[str, Any]) -> type:

tests/unittests/integration/test_tool_utils.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,14 @@
1010
import pytest
1111

1212
from agentrun.integration.utils.tool import (
13+
_build_tool_from_meta,
14+
_create_function_with_signature,
1315
_extract_core_schema,
16+
_json_schema_to_pydantic,
1417
_load_json,
1518
_merge_schema_dicts,
1619
_normalize_tool_arguments,
20+
_sanitize_python_identifier,
1721
_to_dict,
1822
CommonToolSet,
1923
from_pydantic,
@@ -702,3 +706,140 @@ def test_get_schema_from_parameters(self):
702706
assert "name" in schema["properties"]
703707
assert "age" in schema["properties"]
704708
assert "name" in schema.get("required", [])
709+
710+
711+
class TestSanitizePythonIdentifier:
    """Unit tests for the field-name sanitizer."""

    def test_valid_identifier_unchanged(self):
        # A name that is already a legal identifier passes through untouched.
        result = _sanitize_python_identifier("normal_name")
        assert result == "normal_name"

    def test_hyphenated_name(self):
        # Hyphens are replaced by underscores.
        result = _sanitize_python_identifier("x-access-id")
        assert result == "x_access_id"

    def test_dotted_name(self):
        # Dots are replaced by underscores.
        result = _sanitize_python_identifier("a.b.c")
        assert result == "a_b_c"

    def test_leading_digit_prefixed(self):
        # A leading digit gets a letter prefix.
        result = _sanitize_python_identifier("123abc")
        assert result == "field_123abc"

    def test_keyword_suffixed(self):
        # Reserved words get a trailing underscore.
        result = _sanitize_python_identifier("class")
        assert result == "class_"

    def test_empty_string(self):
        # Empty input falls back to the placeholder name.
        result = _sanitize_python_identifier("")
        assert result == "field"

    def test_only_invalid_chars(self):
        # Input consisting solely of illegal characters also falls back.
        result = _sanitize_python_identifier("---")
        assert result == "field"
734+
735+
736+
class TestJsonSchemaToPydanticInvalidFieldNames:
    """Covers _json_schema_to_pydantic for property names that are not valid Python identifiers."""

    def test_hyphenated_field_name_builds_model(self):
        """A '-' in a property name must not raise, and the JSON Schema still exposes the original name."""
        properties = {"x-access-id": {"type": "string", "description": "id"}}
        schema = {
            "type": "object",
            "properties": properties,
            "required": ["x-access-id"],
        }

        model = _json_schema_to_pydantic("Args", schema)

        assert model is not None
        assert "x_access_id" in model.model_fields
        exported = model.model_json_schema()
        assert "x-access-id" in exported["properties"]
        assert "x-access-id" in exported["required"]

    def test_keyword_field_name_sanitized(self):
        properties = {"class": {"type": "string", "description": "py keyword"}}
        schema = {"type": "object", "properties": properties}

        model = _json_schema_to_pydantic("Args", schema)

        assert model is not None
        assert "class_" in model.model_fields
        exported_properties = model.model_json_schema()["properties"]
        assert "class" in exported_properties

    def test_accepts_both_original_and_sanitized_name(self):
        properties = {"x-access-id": {"type": "string"}}
        schema = {
            "type": "object",
            "properties": properties,
            "required": ["x-access-id"],
        }

        model = _json_schema_to_pydantic("Args", schema)

        # Original name: accepted through the alias.
        from_alias = model(**{"x-access-id": "v1"})
        assert from_alias.model_dump(by_alias=True) == {"x-access-id": "v1"}
        # Sanitized name: accepted through populate_by_name.
        from_py_name = model(x_access_id="v2")
        assert from_py_name.model_dump(by_alias=True) == {"x-access-id": "v2"}
788+
789+
790+
class TestCreateFunctionWithSignatureAliasSanitization:
    """Covers the defensive handling of illegal alias names in _create_function_with_signature."""

    def test_alias_with_hyphen_sanitized(self):
        """An alias in `__agentrun_argument_aliases__` that is not an identifier must not crash."""
        import inspect as _inspect

        from pydantic import BaseModel as _BM

        class _Args(_BM):
            query: str

        alias_map = {"x-alias": "query"}
        setattr(_Args, "__agentrun_argument_aliases__", alias_map)

        func = _create_function_with_signature(
            "demo", _Args, MagicMock(), None
        )

        parameter_names = _inspect.signature(func).parameters
        # The primary field is kept and the alias shows up in sanitized form.
        assert "query" in parameter_names
        assert "x_alias" in parameter_names
811+
812+
813+
class TestBuildToolFromMetaInvalidFieldNames:
    """Covers the full _build_tool_from_meta path (regression for the 'x-access-id' load failure)."""

    def test_mcp_input_schema_with_hyphen_field(self):
        """A Tool can still be built when MCP tool metadata has an 'x-access-id' input."""
        toolset = MagicMock()
        toolset.call_tool = MagicMock(return_value={"status": "ok"})

        input_schema = {
            "type": "object",
            "properties": {
                "x-access-id": {"type": "string", "description": "id"},
                "value": {"type": "integer"},
            },
            "required": ["x-access-id"],
        }
        meta = {
            "name": "demo-tool",
            "description": "demo",
            "input_schema": input_schema,
        }

        tool_obj = _build_tool_from_meta(toolset, meta, None)

        assert tool_obj is not None
        # When the tool is invoked, MCP must receive the original name 'x-access-id'.
        tool_obj.func(**{"x-access-id": "abc", "value": 1})
        toolset.call_tool.assert_called_once()
        sent_arguments = toolset.call_tool.call_args.kwargs["arguments"]
        assert sent_arguments["x-access-id"] == "abc"
        assert sent_arguments["value"] == 1

0 commit comments

Comments
 (0)