Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions sdk/nexent/core/agents/nexent_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,16 @@ def create_local_tool(self, tool_config: ToolConfig):
raise ValueError(f"{class_name} not found in local")
else:
if class_name == "KnowledgeBaseSearchTool":
# Filter out conflicting parameters from params to avoid conflicts
# These parameters have exclude=True and cannot be passed to __init__
# due to smolagents.tools.Tool wrapper restrictions
# Filter out conflicting parameters from params to avoid conflicts.
# Parameters declared with exclude=True cannot be passed to __init__
# due to smolagents.tools.Tool wrapper restrictions; they are set as
# attributes on the instance after construction, sourced from metadata.
# `document_paths` is intentionally hidden from the LLM and only
# populated via tool_params from the northbound interface.
filtered_params = {k: v for k, v in params.items()
if k not in ["vdb_core", "embedding_model", "observer", "rerank_model", "display_name_to_index_map"]}
if k not in ["vdb_core", "embedding_model", "observer",
"rerank_model", "display_name_to_index_map",
"document_paths"]}
# Create instance with only non-excluded parameters
tools_obj = tool_class(**filtered_params)
# Set excluded parameters directly as attributes after instantiation
Expand All @@ -216,6 +221,13 @@ def create_local_tool(self, tool_config: ToolConfig):
"rerank_model", None) if tool_config.metadata else None
tools_obj.display_name_to_index_map = tool_config.metadata.get(
"display_name_to_index_map", {}) if tool_config.metadata else {}
# Internal access control: restrict results to documents whose
# path_or_url is in the allow list. Only the northbound interface
# may populate this; never the LLM.
tools_obj.set_document_paths(
tool_config.metadata.get(
"document_paths") if tool_config.metadata else None

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] metadata 里的 document_paths 被直接送进访问控制过滤器,没有校验来源和类型。只要 metadata 被错误构造为字符串或混合 URL,就会影响检索授权;这里应在 agent 侧先 validate/canonicalize。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] metadata 里的 document_paths 被直接送进访问控制过滤器,没有校验来源和类型。只要 metadata 被错误构造为字符串或混合 URL,就会影响检索授权;这里应在 agent 侧先 validate/canonicalize。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

事后审查补充:[P1] metadata 里的 document_paths 被直接送进访问控制过滤器,没有校验来源和类型。只要 metadata 被错误构造为字符串或混合 URL,就会影响检索授权;这里应在 agent 侧先 validate/canonicalize。

影响:这个问题合入后会在对应部署、运行或权限场景中留下真实故障/安全风险,后续排查成本较高。
建议:沿着上述风险点补齐校验、配置来源、权限边界或回归测试,避免同类问题再次出现。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

二次事后审查补充:[P1] metadata 里的 document_paths 被直接送进访问控制过滤器,没有校验来源和类型。只要 metadata 被错误构造为字符串或混合 URL,就会影响检索授权;这里应在 agent 侧先 validate/canonicalize。

影响:该问题合入后仍可能在真实部署、运行、权限或测试场景中形成回归风险。
建议:后续按这个风险点补齐边界校验、配置来源收敛、权限约束或针对性回归测试。

)
elif class_name in ["DifySearchTool", "DataMateSearchTool"]:
# These parameters have exclude=True and cannot be passed to __init__
filtered_params = {k: v for k, v in params.items()
Expand Down
24 changes: 21 additions & 3 deletions sdk/nexent/core/tools/knowledge_base_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
logger = logging.getLogger("knowledge_base_search_tool")


def _unwrap_field_info(value):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_unwrap_field_info 只处理了 FieldInfo 的一层包装。如果存在嵌套的 FieldInfo(例如 Field(default=Field(default=...))),当前实现无法递归解包。虽然目前不太可能出现嵌套情况,但建议在函数文档中明确说明仅支持单层解包,或添加递归处理。

"""Resolve a value that may be wrapped in a Pydantic FieldInfo.

Parameters declared with `Field(...)` and `exclude=True` are not expanded by
smolagents' Tool wrapper, so they arrive at `__init__` as raw FieldInfo
instances instead of their declared defaults. This helper extracts the
concrete value so callers can safely treat the result as plain data.
"""
if isinstance(value, FieldInfo):
if value.default_factory is not None:
return value.default_factory()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P2] default_factory 每次 unwrap 都会执行,包括过滤阶段再次调用时。如果 factory 生成的是可变列表或有副作用,允许列表会在一次请求内变化;建议在 init/set_document_paths 时解析一次并缓存具体值。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P2] default_factory 每次 unwrap 都会执行,包括过滤阶段再次调用时。如果 factory 生成的是可变列表或有副作用,允许列表会在一次请求内变化;建议在 init/set_document_paths 时解析一次并缓存具体值。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

事后审查补充:[P2] default_factory 每次 unwrap 都会执行,包括过滤阶段再次调用时。如果 factory 生成的是可变列表或有副作用,允许列表会在一次请求内变化;建议在 init/set_document_paths 时解析一次并缓存具体值。

影响:这个问题合入后会在对应部署、运行或权限场景中留下真实故障/安全风险,后续排查成本较高。
建议:沿着上述风险点补齐校验、配置来源、权限边界或回归测试,避免同类问题再次出现。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

二次事后审查补充:[P2] default_factory 每次 unwrap 都会执行,包括过滤阶段再次调用时。如果 factory 生成的是可变列表或有副作用,允许列表会在一次请求内变化;建议在 init/set_document_paths 时解析一次并缓存具体值。

影响:该问题合入后仍可能在真实部署、运行、权限或测试场景中形成回归风险。
建议:后续按这个风险点补齐边界校验、配置来源收敛、权限约束或针对性回归测试。

return value.default
return value

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] 这里直接返回原值,没有保证 document_paths 是 list[str]。如果外部传入字符串,后续 membership 会变成对子串的检查,可能错误放行或丢弃结果;需要显式校验/转换类型。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] 这里直接返回原值,没有保证 document_paths 是 list[str]。如果外部传入字符串,后续 membership 会变成对子串的检查,可能错误放行或丢弃结果;需要显式校验/转换类型。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

事后审查补充:[P1] 这里直接返回原值,没有保证 document_paths 是 list[str]。如果外部传入字符串,后续 membership 会变成对子串的检查,可能错误放行或丢弃结果;需要显式校验/转换类型。

影响:这个问题合入后会在对应部署、运行或权限场景中留下真实故障/安全风险,后续排查成本较高。
建议:沿着上述风险点补齐校验、配置来源、权限边界或回归测试,避免同类问题再次出现。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

二次事后审查补充:[P1] 这里直接返回原值,没有保证 document_paths 是 list[str]。如果外部传入字符串,后续 membership 会变成对子串的检查,可能错误放行或丢弃结果;需要显式校验/转换类型。

影响:该问题合入后仍可能在真实部署、运行、权限或测试场景中形成回归风险。
建议:后续按这个风险点补齐边界校验、配置来源收敛、权限约束或针对性回归测试。



class KnowledgeBaseSearchTool(Tool):
"""Knowledge base search tool"""

Expand Down Expand Up @@ -129,7 +144,10 @@ def __init__(
self.rerank_model = rerank_model
self.data_process_service = os.getenv("DATA_PROCESS_SERVICE")
self.display_name_to_index_map = display_name_to_index_map
self._internal_document_paths = document_paths
# `document_paths` is declared with `exclude=True` so smolagents passes the
# raw FieldInfo default when no value is supplied. Unwrap it here so the
# internal filter is always a concrete list (or None), never a FieldInfo.
self._internal_document_paths = _unwrap_field_info(document_paths)

self.record_ops = 1
self.running_prompt_zh = "知识库检索中..."
Expand All @@ -144,7 +162,7 @@ def set_document_paths(self, document_paths: Optional[List[str]]) -> None:
Args:
document_paths: List of allowed document path_or_urls. If None, no filtering is applied.
"""
self._internal_document_paths = document_paths
self._internal_document_paths = _unwrap_field_info(document_paths)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] set_document_paths 仍然接受任意值并直接保存 unwrap 结果。这个字段承担访问控制,不能只处理 FieldInfo;应该拒绝非 list[str],并把空列表保留为“禁止所有文档”而不是 None。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] set_document_paths 仍然接受任意值并直接保存 unwrap 结果。这个字段承担访问控制,不能只处理 FieldInfo;应该拒绝非 list[str],并把空列表保留为“禁止所有文档”而不是 None。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

事后审查补充:[P1] set_document_paths 仍然接受任意值并直接保存 unwrap 结果。这个字段承担访问控制,不能只处理 FieldInfo;应该拒绝非 list[str],并把空列表保留为“禁止所有文档”而不是 None。

影响:这个问题合入后会在对应部署、运行或权限场景中留下真实故障/安全风险,后续排查成本较高。
建议:沿着上述风险点补齐校验、配置来源、权限边界或回归测试,避免同类问题再次出现。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

二次事后审查补充:[P1] set_document_paths 仍然接受任意值并直接保存 unwrap 结果。这个字段承担访问控制,不能只处理 FieldInfo;应该拒绝非 list[str],并把空列表保留为“禁止所有文档”而不是 None。

影响:该问题合入后仍可能在真实部署、运行、权限或测试场景中形成回归风险。
建议:后续按这个风险点补齐边界校验、配置来源收敛、权限约束或针对性回归测试。


def _convert_to_index_names(self, names: List[str]) -> List[str]:
"""Convert display names (knowledge_name) to index names if necessary.
Expand Down Expand Up @@ -188,7 +206,7 @@ def _filter_by_document_paths(self, results: List[dict]) -> List[dict]:
Returns:
Filtered list containing only results with allowed document paths
"""
allowed_paths = self._internal_document_paths
allowed_paths = _unwrap_field_info(self._internal_document_paths)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_unwrap_field_info 被调用了 3 次(__init__set_document_paths_filter_by_document_paths),每次都对同一个属性做类型检查。建议在 __init__set_document_paths 中 unwrap 后存储为普通值,_filter_by_document_paths 就不需要再 unwrap 了。当前的防御性重复调用虽然安全,但增加了理解成本。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[代码规范] _unwrap_field_info_filter_by_document_paths 中每次调用时都被重复执行,但 self._internal_document_paths__init__set_document_paths 中已经被 unwrap 过了。建议在赋值时就保证值是 unwrap 的,此处无需再次调用,避免不必要的运行时开销,也让数据流更清晰。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_filter_by_document_paths 中每次都调用 _unwrap_field_info 来解包 self._internal_document_paths,但正常情况下该值在 __init__set_document_paths 中已经被解包过了。这里的重复解包是不必要的性能开销。建议只在 set_document_paths__init__ 中解包一次,此处直接使用 self._internal_document_paths

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] 过滤前没有把 path_or_url 和 allowed_paths 规范到同一种对象路径。s3://、/nexent/、presigned URL 三种形式会互相不匹配,访问控制结果取决于上游返回格式。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] 过滤前没有把 path_or_url 和 allowed_paths 规范到同一种对象路径。s3://、/nexent/、presigned URL 三种形式会互相不匹配,访问控制结果取决于上游返回格式。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

事后审查补充:[P1] 过滤前没有把 path_or_url 和 allowed_paths 规范到同一种对象路径。s3://、/nexent/、presigned URL 三种形式会互相不匹配,访问控制结果取决于上游返回格式。

影响:这个问题合入后会在对应部署、运行或权限场景中留下真实故障/安全风险,后续排查成本较高。
建议:沿着上述风险点补齐校验、配置来源、权限边界或回归测试,避免同类问题再次出现。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

二次事后审查补充:[P1] 过滤前没有把 path_or_url 和 allowed_paths 规范到同一种对象路径。s3://、/nexent/、presigned URL 三种形式会互相不匹配,访问控制结果取决于上游返回格式。

影响:该问题合入后仍可能在真实部署、运行、权限或测试场景中形成回归风险。
建议:后续按这个风险点补齐边界校验、配置来源收敛、权限约束或针对性回归测试。

if not allowed_paths:
return results

Expand Down
82 changes: 82 additions & 0 deletions test/sdk/core/agents/test_nexent_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,88 @@ def test_create_local_tool_knowledge_base_with_display_name_map(nexent_agent_ins
assert result.rerank_model == "mock_rerank_model"


def test_create_local_tool_knowledge_base_with_document_paths_from_metadata(nexent_agent_instance):
"""KnowledgeBaseSearchTool should receive document_paths from metadata via set_document_paths.

The `document_paths` parameter is declared with `exclude=True` so it must not
be passed to __init__. Instead it must be forwarded to `set_document_paths`
on the instance, sourced from `tool_config.metadata`. This guards against
the FieldInfo-iteration regression reported when document_paths is unset.
"""
mock_kb_tool_class = MagicMock()
mock_kb_tool_instance = MagicMock()
mock_kb_tool_class.return_value = mock_kb_tool_instance

document_paths = ["s3://bucket/doc1.txt", "s3://bucket/doc2.txt"]

tool_config = ToolConfig(
class_name="KnowledgeBaseSearchTool",
name="knowledge_base_search",
description="desc",
inputs="{}",
output_type="string",
params={"top_k": 5, "index_names": ["kb1"]},
source="local",
metadata={
"vdb_core": "mock_vdb_core",
"embedding_model": "mock_embedding_model",
"document_paths": document_paths,
},
)

original_value = nexent_agent.__dict__.get("KnowledgeBaseSearchTool")
nexent_agent.__dict__["KnowledgeBaseSearchTool"] = mock_kb_tool_class

try:
nexent_agent_instance.create_local_tool(tool_config)
finally:
if original_value is not None:
nexent_agent.__dict__["KnowledgeBaseSearchTool"] = original_value
elif "KnowledgeBaseSearchTool" in nexent_agent.__dict__:
del nexent_agent.__dict__["KnowledgeBaseSearchTool"]

# document_paths is excluded and must not be forwarded to __init__.
init_kwargs = mock_kb_tool_class.call_args.kwargs
assert "document_paths" not in init_kwargs
# It must instead be applied via set_document_paths on the instance.
mock_kb_tool_instance.set_document_paths.assert_called_once_with(document_paths)


def test_create_local_tool_knowledge_base_without_metadata_calls_set_document_paths_none(nexent_agent_instance):
"""When metadata lacks document_paths, set_document_paths(None) must still be invoked.

Ensures the tool's internal filter is explicitly reset to None rather than
left as a stale FieldInfo default from the smolagents wrapper.
"""
mock_kb_tool_class = MagicMock()
mock_kb_tool_instance = MagicMock()
mock_kb_tool_class.return_value = mock_kb_tool_instance

tool_config = ToolConfig(
class_name="KnowledgeBaseSearchTool",
name="knowledge_base_search",
description="desc",
inputs="{}",
output_type="string",
params={"top_k": 5, "index_names": ["kb1"]},
source="local",
metadata=None,
)

original_value = nexent_agent.__dict__.get("KnowledgeBaseSearchTool")
nexent_agent.__dict__["KnowledgeBaseSearchTool"] = mock_kb_tool_class

try:
nexent_agent_instance.create_local_tool(tool_config)
finally:
if original_value is not None:
nexent_agent.__dict__["KnowledgeBaseSearchTool"] = original_value
elif "KnowledgeBaseSearchTool" in nexent_agent.__dict__:
del nexent_agent.__dict__["KnowledgeBaseSearchTool"]

mock_kb_tool_instance.set_document_paths.assert_called_once_with(None)


def test_create_local_tool_knowledge_base_with_empty_display_name_map(nexent_agent_instance):
"""Test KnowledgeBaseSearchTool creation handles empty display_name_to_index_map."""
mock_kb_tool_class = MagicMock()
Expand Down
88 changes: 88 additions & 0 deletions test/sdk/core/tools/test_knowledge_base_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -1776,3 +1776,91 @@ def test_forward_with_document_paths_filter_no_results_after_filter(self, mock_v

assert "No results found" in str(excinfo.value)

def test_filter_by_document_paths_unwraps_fieldinfo_default(self, mock_vdb_core, mock_embedding_model):
"""Filter should tolerate a FieldInfo default instead of a concrete list.

Regression: smolagents' Tool wrapper does not expand FieldInfo defaults for
parameters declared with `exclude=True`, so `self._internal_document_paths`
may arrive as a FieldInfo. The filter must unwrap it instead of failing with
`TypeError: argument of type 'FieldInfo' is not iterable`.
"""
try:
from pydantic import FieldInfo
except ImportError:
from pydantic.fields import FieldInfo

field_info_default = FieldInfo(default=["s3://bucket/doc1.txt"])

tool = KnowledgeBaseSearchTool(
index_names=["kb1"],
search_mode="hybrid",
vdb_core=mock_vdb_core,
embedding_model=mock_embedding_model,
document_paths=None,
)
# Simulate a FieldInfo being assigned directly (e.g. from smolagents wrapper).
tool._internal_document_paths = field_info_default

results = self._create_mock_formatted_results_with_paths(
["s3://bucket/doc1.txt", "s3://bucket/doc2.txt"]
)
filtered = tool._filter_by_document_paths(results)

assert len(filtered) == 1
assert filtered[0]["path_or_url"] == "s3://bucket/doc1.txt"

def test_filter_by_document_paths_unwraps_fieldinfo_default_factory(self, mock_vdb_core, mock_embedding_model):
"""Filter should tolerate a FieldInfo with default_factory."""
try:
from pydantic import FieldInfo
except ImportError:
from pydantic.fields import FieldInfo

field_info_factory = FieldInfo(
default_factory=lambda: ["s3://bucket/doc2.txt"]
)

tool = KnowledgeBaseSearchTool(
index_names=["kb1"],
search_mode="hybrid",
vdb_core=mock_vdb_core,
embedding_model=mock_embedding_model,
document_paths=None,
)
tool._internal_document_paths = field_info_factory

results = self._create_mock_formatted_results_with_paths(
["s3://bucket/doc1.txt", "s3://bucket/doc2.txt"]
)
filtered = tool._filter_by_document_paths(results)

assert len(filtered) == 1
assert filtered[0]["path_or_url"] == "s3://bucket/doc2.txt"

def test_set_document_paths_unwraps_fieldinfo(self, mock_vdb_core, mock_embedding_model):
"""set_document_paths should also accept FieldInfo input defensively."""
try:
from pydantic import FieldInfo
except ImportError:
from pydantic.fields import FieldInfo

tool = KnowledgeBaseSearchTool(
index_names=["kb1"],
search_mode="hybrid",
vdb_core=mock_vdb_core,
embedding_model=mock_embedding_model,
document_paths=None,
)

field_info = FieldInfo(default=["s3://bucket/doc1.txt"])
tool.set_document_paths(field_info)

results = self._create_mock_formatted_results_with_paths(
["s3://bucket/doc1.txt", "s3://bucket/doc2.txt"]
)
filtered = tool._filter_by_document_paths(results)

assert len(filtered) == 1
assert filtered[0]["path_or_url"] == "s3://bucket/doc1.txt"


Loading