telemetryflow
diff --git a/app/agent/investigation.py b/app/agent/investigation.py
Lines changed: 51 additions & 4 deletions
diff --git a/app/agent/llm_invoke_errors.py b/app/agent/llm_invoke_errors.py
Lines changed: 2 additions & 2 deletions
diff --git a/app/pipeline/pipeline.py b/app/pipeline/pipeline.py
Lines changed: 19 additions & 3 deletions
diff --git a/app/pipeline/runners.py b/app/pipeline/runners.py
Lines changed: 13 additions & 2 deletions
diff --git a/app/services/agent_llm_client.py b/app/services/agent_llm_client.py
Lines changed: 4 additions & 3 deletions
diff --git a/app/tools/EKSListClustersTool/__init__.py b/app/tools/EKSListClustersTool/__init__.py
Lines changed: 0 additions & 7 deletions
diff --git a/app/tools/registry.py b/app/tools/registry.py
Lines changed: 69 additions & 24 deletions
diff --git a/app/tools/utils/availability.py b/app/tools/utils/availability.py
Lines changed: 6 additions & 9 deletions
@@ -77,6 +77,33 @@
 class ConnectedInvestigationAgent:
     """ReAct loop scoped to the tools enabled by connected integrations."""
 
+    def _should_accept_conclusion(
+        self,
+        *,
+        evidence_count: int,  # noqa: ARG002 — used by overrides
+        iteration: int,  # noqa: ARG002 — used by overrides
+    ) -> tuple[bool, str | None]:
+        """Hook: decide what to do when the LLM stops requesting tools.
+
+        Args:
+            evidence_count: Number of evidence entries collected so far, as
+                passed by ``run``.
+            iteration: Zero-based index of the current loop iteration, as
+                passed by ``run``.
+
+        Returns ``(accept_conclusion, nudge)``:
+          - ``(True, None)`` — accept the LLM's choice, exit the loop. Default.
+          - ``(False, "...")`` — reject the bail, inject the nudge string as a
+            user message, continue the loop. ``MAX_INVESTIGATION_LOOPS`` still
+            caps the worst case so a stubborn model can't infinite-loop.
+
+        **Contract:** ``(False, None)`` is invalid and raises ``ValueError`` at
+        the call site. Rejecting the conclusion without providing a nudge
+        would spin the loop on an unchanged message history until the outer
+        iteration cap, silently burning the token budget. The type system
+        allows ``str | None`` so subclasses can use a single return type;
+        the runtime guard enforces the actual contract.
+
+        Default returns ``(True, None)`` — production agents accept whatever
+        the LLM decides. Subclasses can override to enforce minimum-evidence
+        floors, structured-stage progression, or other termination policies.
+        """
+        return True, None
+
     def run(
         self,
         state: dict[str, Any],
@@ -203,8 +230,28 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
             messages.append(_build_assistant_msg(llm, response))
 
             if not response.has_tool_calls:
-                logger.debug("[agent] no tool calls — done after %d iterations", iteration + 1)
-                break
+                accept, nudge = self._should_accept_conclusion(
+                    evidence_count=len(evidence_entries),
+                    iteration=iteration,
+                )
+                if accept:
+                    logger.debug("[agent] no tool calls — done after %d iterations", iteration + 1)
+                    break
+                # Contract: rejecting the conclusion (accept=False) MUST
+                # come with a nudge so the next LLM call sees new context.
+                # Without one the loop would spin on an unchanged message
+                # history until MAX_INVESTIGATION_LOOPS, silently burning
+                # the entire token budget without making progress. Failing
+                # fast keeps buggy hook overrides loud rather than expensive.
+                if nudge is None:
+                    raise ValueError(
+                        f"{type(self).__name__}._should_accept_conclusion returned "
+                        "(False, None) — a nudge string is required when rejecting "
+                        "the conclusion, otherwise the LLM will loop on an unchanged "
+                        "message history until MAX_INVESTIGATION_LOOPS."
+                    )
+                messages.append({"role": "user", "content": nudge})
+                continue
 
             # Emit tool_start for each pending call before executing
             for tc in response.tool_calls:
@@ -347,8 +394,8 @@ def _enforce_context_budget(
 
     No-op on the happy path: the estimate covers messages + system + tools
     in one pass and returns under the ceiling for normal investigations.
-    Only fires on long CloudOpsBench cases where unbounded tool history
-    has pushed the prompt past the model's limit.
+    Only fires on long investigations where unbounded tool history has
+    pushed the prompt past the model's limit.
     """
     while _estimate_message_tokens(messages, system=system, tools=tools) > _TOKEN_BUDGET_CEILING:
         if not _trim_oldest_tool_pair(messages):
 
@@ -68,8 +68,8 @@ def classify_llm_invoke_failure(exc: BaseException) -> LLMInvokeFailure | None:
 
     Returns ``None`` to signal the caller should re-raise. In particular,
     :class:`LLMCreditExhaustedError` is intentionally NOT classified — it
-    represents a non-recoverable billing condition that the bench runner
-    (and production agent) must halt on, not wrap into a degraded result.
+    represents a non-recoverable billing condition that callers must halt
+    on, not wrap into a degraded result.
     """
     from app.integrations.llm_cli.errors import (
         CLIAuthenticationRequired,
 
@@ -4,10 +4,16 @@
 
 import logging
 from datetime import UTC, datetime
-from typing import Any, cast
+from typing import TYPE_CHECKING, Any, cast
 
 from app.state import AgentState
 
+if TYPE_CHECKING:
+    # Type-only import — avoids paying the agent module's heavy import cost
+    # at pipeline load while still letting static type-checkers validate
+    # ``agent_class`` injections.
+    from app.agent.investigation import ConnectedInvestigationAgent
+
 logger = logging.getLogger(__name__)
 
 
@@ -133,11 +139,20 @@ def log_query(query: str, window: dict[str, Any]) -> dict[str, Any]:
     return {"configurable": {"upstream_evidence_provider": provider}}
 
 
-def run_connected_investigation(state: AgentState) -> AgentState:
+def run_connected_investigation(
+    state: AgentState,
+    *,
+    agent_class: type[ConnectedInvestigationAgent] | None = None,
+) -> AgentState:
     """Resolve connected integrations → parse alert → agent loop → deliver.
 
     All steps mutate a shared state dict. Each step returns a dict of updates
     which are merged in. Pure function: inputs in, state out.
+
+    ``agent_class``: optional override for the investigation agent class.
+    Defaults to :class:`ConnectedInvestigationAgent`. Callers that need a
+    custom termination policy, structured-stage progression, or other
+    agent-level extensions can pass a subclass instead.
     """
     from app.agent.context import resolve_integrations
     from app.agent.extract import extract_alert
@@ -146,6 +161,7 @@ def run_connected_investigation(state: AgentState) -> AgentState:
     from app.delivery import deliver
     from app.utils.sentry_sdk import capture_exception
 
+    agent_class = agent_class or ConnectedInvestigationAgent
     state_any = cast(dict[str, Any], state)
 
     try:
@@ -155,7 +171,7 @@ def run_connected_investigation(state: AgentState) -> AgentState:
         if state_any.get("is_noise"):
             return cast(AgentState, state_any)
 
-        _merge(state_any, ConnectedInvestigationAgent().run(state_any))
+        _merge(state_any, agent_class().run(state_any))
         _merge(
             state_any,
             node_correlate_upstream(
 
@@ -9,14 +9,20 @@
 import threading
 from collections.abc import AsyncIterator, Callable
 from dataclasses import dataclass
-from typing import Any, cast
+from typing import TYPE_CHECKING, Any, cast
 
 from app.remote.stream import StreamEvent
 from app.state import AgentState, make_initial_state
 from app.types.config import NodeConfig
 from app.utils.errors import report_and_reraise
 from app.utils.sentry_sdk import init_sentry
 
+if TYPE_CHECKING:
+    # Type-only — avoids paying the agent module's heavy import cost at
+    # runner load while still letting static type-checkers validate
+    # ``agent_class`` injections.
+    from app.agent.investigation import ConnectedInvestigationAgent
+
 logger = logging.getLogger(__name__)
 
 # Serializes temporary render_report monkeypatches when multiple streaming
@@ -80,6 +86,7 @@ def run_investigation(
     openclaw_context: dict[str, Any] | None = None,
     opensre_evaluate: bool = False,
     investigation_metadata: tuple[str, str, str] | None = None,
+    agent_class: type[ConnectedInvestigationAgent] | None = None,
 ) -> AgentState:
     """Run the investigation from a raw alert payload. Pure function: inputs in, state out.
 
@@ -90,6 +97,10 @@ def run_investigation(
             FixtureGrafanaBackend should be injected without real credential resolution.
         investigation_metadata: Optional ``(alert_name, pipeline_name, severity)`` for
             initial state; avoids copying those fields onto ``raw_alert``.
+        agent_class: Optional override for the investigation agent class. Defaults
+            to ``ConnectedInvestigationAgent``. Callers that need a custom
+            termination policy, structured-stage progression, or other
+            agent-level extensions can pass a subclass instead.
     """
     init_sentry(entrypoint="pipeline")
     from app.pipeline.pipeline import run_connected_investigation as _run
@@ -109,7 +120,7 @@ def run_investigation(
         message="run_investigation failed",
         tags={"surface": "pipeline", "component": "app.pipeline.runners"},
     ):
-        return _run(initial)
+        return _run(initial, agent_class=agent_class)
 
 
 def run_chat(state: AgentState, _config: NodeConfig | None = None) -> AgentState:
 
@@ -56,9 +56,10 @@ def _rate_limit_sleep_seconds(err: BaseException, fallback_backoff: float) -> fl
     jitter ``Uniform(0, fallback_backoff)`` when no hint is available — same
     pattern AWS and the Anthropic/OpenAI SDKs use for transient errors.
 
-    Always adds ±10% jitter even on the server hint: with multiple bench
-    workers, four clients all sleeping for *exactly* the suggested 94ms
-    would still wake up in lockstep and re-trigger the same TPM bucket.
+    Always adds ±10% jitter even on the server hint: with multiple
+    concurrent callers, clients all sleeping for *exactly* the suggested
+    94ms would still wake up in lockstep and re-trigger the same TPM
+    bucket.
 
     Logs which branch produced the sleep duration so operators can audit
     whether the Retry-After path is actually firing (most OpenAI 429s
 
@@ -16,13 +16,6 @@
 
 
 def _eks_available(sources: dict[str, dict]) -> bool:
-    # In CloudOpsBench replay mode the EKS surface is served by the case
-    # snapshot via CloudOpsBenchK8sTools. Exposing the real EKS tools too
-    # would have the agent attempt sts:AssumeRole against placeholder ARNs
-    # like arn:aws:iam::placeholder:role/placeholder, which always fails.
-    backend = (sources.get("eks") or {}).get("_backend")
-    if getattr(backend, "is_cloudopsbench_backend", False):
-        return False
+    """Return True when the ``eks`` source reports a verified connection."""
     return bool(sources.get("eks", {}).get("connection_verified"))
 
 
 
@@ -6,6 +6,7 @@
 import inspect
 import logging
 import pkgutil
+import threading
 from functools import lru_cache
 from types import ModuleType
 
@@ -25,6 +26,44 @@
     "utils",
 }
 
+# Extension point: callers outside ``app.tools.*`` (e.g. test suites,
+# external benchmark harnesses, downstream integrators) can register
+# additional tool packages by calling
+# :func:`register_external_tool_package`. Registered packages are walked
+# the same way as :mod:`app.tools` — each top-level submodule is imported
+# and any ``@tool``-decorated callables are picked up.
+#
+# Production stays clean: with no external registrations, the registry
+# discovers only ``app.tools.*``. The list is *not* persisted across
+# processes — every fresh import of opensre starts with zero externals.
+_external_tool_packages: list[ModuleType] = []
+# Serializes the membership check and append during registration so
+# concurrent callers cannot add the same package twice.
+_external_registration_lock = threading.Lock()
+
+
+def register_external_tool_package(package: ModuleType) -> None:
+    """Register an additional tool package for registry discovery.
+
+    Call before any ``get_registered_tools()`` consumer in the same
+    process. The registry cache is cleared so the new package's tools
+    appear on the next lookup.
+
+    Idempotent and thread-safe: concurrent callers registering the same
+    package (e.g. multiple workers in a ``ThreadPoolExecutor`` each
+    importing the same external package) won't add duplicate entries
+    that would otherwise produce noisy ``Duplicate tool name`` warnings
+    on every subsequent registry walk.
+
+    Production code does NOT call this — it's a hook for test suites
+    and external integrators that ship their own tools but want them
+    routed through opensre's agent loop.
+
+    Args:
+        package: An imported *package* (a module exposing ``__path__``)
+            whose top-level submodules define the tools to discover.
+
+    Raises:
+        ValueError: If ``package`` has no ``__path__``. The registry walk
+            iterates ``package.__path__``, so a plain module would
+            otherwise raise ``AttributeError`` during tool discovery.
+    """
+    # Fail fast at registration: the registry walk reads ``__path__``
+    # outside its import try/except, so a bad entry here would abort
+    # discovery of every tool rather than just the offending package.
+    if not hasattr(package, "__path__"):
+        raise ValueError(
+            f"{package.__name__!r} is not a package (no __path__); "
+            "only packages can be registered for tool discovery."
+        )
+    with _external_registration_lock:
+        if package in _external_tool_packages:
+            return
+        _external_tool_packages.append(package)
+        clear_tool_registry_cache()
+
+
 # Preserve the current chat surface while the repo migrates toward explicit
 # per-tool surface metadata.
 _LEGACY_CHAT_TOOL_NAMES = {
@@ -46,9 +85,9 @@
 }
 
 
-def _iter_tool_module_names() -> list[str]:
+def _iter_tool_module_names(package: ModuleType) -> list[str]:
     module_names: list[str] = []
-    for module_info in pkgutil.iter_modules(tools_package.__path__):
+    for module_info in pkgutil.iter_modules(package.__path__):
         if module_info.name in _SKIP_MODULE_NAMES:
             continue
         if module_info.name.startswith("_") or module_info.name.endswith("_test"):
@@ -57,8 +96,8 @@ def _iter_tool_module_names() -> list[str]:
     return sorted(module_names)
 
 
-def _import_tool_module(module_name: str) -> ModuleType:
-    return importlib.import_module(f"{tools_package.__name__}.{module_name}")
+def _import_tool_module(package: ModuleType, module_name: str) -> ModuleType:
+    """Import and return the submodule ``module_name`` of ``package``."""
+    return importlib.import_module(f"{package.__name__}.{module_name}")
 
 
 def _candidate_belongs_to_module(candidate: object, module_name: str) -> bool:
@@ -122,29 +161,35 @@ def _collect_registered_tools_from_module(module: ModuleType) -> list[Registered
 def _load_registry_snapshot() -> tuple[RegisteredTool, ...]:
+    """Discover tools across all registered packages, sorted by tool name.
+
+    A module that fails to import is logged and skipped rather than
+    aborting discovery. When a tool name repeats, the first definition
+    encountered is kept and the later one is logged and dropped.
+    """
     tools_by_name: dict[str, RegisteredTool] = {}
 
-    for module_name in _iter_tool_module_names():
-        try:
-            module = _import_tool_module(module_name)
-        except ModuleNotFoundError as exc:
-            logger.warning("[tools] Skipping %s: %s", module_name, exc)
-            continue
-        except Exception as exc:
-            logger.warning(
-                "[tools] Skipping %s due to import failure: %s",
-                module_name,
-                exc,
-                exc_info=True,
-            )
-            continue
-
-        for tool in _collect_registered_tools_from_module(module):
-            if tool.name in tools_by_name:
+    # Walk the canonical tools package, then any externally-registered
+    # packages in the order they were registered. First definition of a
+    # given tool name wins; duplicates are logged and skipped.
+    packages: list[ModuleType] = [tools_package, *_external_tool_packages]
+    for package in packages:
+        for module_name in _iter_tool_module_names(package):
+            try:
+                module = _import_tool_module(package, module_name)
+            except ModuleNotFoundError as exc:
+                logger.warning("[tools] Skipping %s.%s: %s", package.__name__, module_name, exc)
+                continue
+            except Exception as exc:
                 logger.warning(
-                    "[tools] Duplicate tool name '%s' across modules; keeping first definition",
-                    tool.name,
+                    "[tools] Skipping %s.%s due to import failure: %s",
+                    package.__name__,
+                    module_name,
+                    exc,
+                    exc_info=True,
                 )
                 continue
-            tools_by_name[tool.name] = tool
+
+            for tool in _collect_registered_tools_from_module(module):
+                if tool.name in tools_by_name:
+                    logger.warning(
+                        "[tools] Duplicate tool name '%s' across modules; keeping first definition",
+                        tool.name,
+                    )
+                    continue
+                tools_by_name[tool.name] = tool
 
     return tuple(sorted(tools_by_name.values(), key=lambda tool: tool.name))
 
 
@@ -20,17 +20,14 @@ def eks_available_or_backend(sources: dict[str, dict]) -> bool:
     support continue to use the narrower check in
     ``app.tools.EKSListClustersTool._eks_available``.
 
-    Exception: in CloudOpsBench replay mode the EKS surface is served by the
-    case snapshot via CloudOpsBenchK8sTools (GetResources, GetClusterConfiguration,
-    etc.). The CloudOpsBenchReplayBackend does not implement the EKS tool API
-    (list_pods, get_pod_logs, ...), so exposing these EKS tools would have the
-    agent call methods that don't exist on the backend.
+    The ``_backend`` slot is reserved for fixture backends that implement
+    the EKS tool API (``list_pods``, ``get_pod_logs``, ...). Other backend
+    types that speak different protocols should be placed in their own
+    distinct source slots and are invisible to this check — the real EKS
+    tools stay deactivated for those modes.
     """
     eks = sources.get("eks", {})
-    backend = eks.get("_backend")
-    if getattr(backend, "is_cloudopsbench_backend", False):
-        return False
-    return bool(eks.get("connection_verified") or backend)
+    return bool(eks.get("connection_verified") or eks.get("_backend"))
 
 
 def datadog_available_or_backend(sources: dict[str, dict]) -> bool: