[WEB-4956] Add no_tools_found Sentry event to diagnose empty discovery scans (#215)

anonpran · audit · claude · web-flow · commit c6b102ac52eb · 2026-06-26T09:33:37.000+05:30
* Add no_tools_found Sentry event to diagnose empty discovery scans

A scan that finds zero tools was previously silent — indistinguishable from
an errored scan, a missed detection, or a genuinely empty device. Emit one
fire-and-forget Sentry warning (phase=no_tools_found) carrying the
discriminators needed to tell them apart:
- homes_enumerated: pre-fallback enumerated-user count (0 = enumeration miss;
  the existing post-fallback user_count masks 0 as 1)
- users_scanned, used_fallback_user, is_root, os, duration_ms

Counts/booleans only (no PII); reuses the existing curl-based report_to_sentry;
wrapped so telemetry can never raise into or slow the scan.

Co-Authored-By: Claude Opus 4.8 (1M context) &lt;noreply@anthropic.com&gt;

* Let no_tools_found bypass the per-run Sentry cap (Greptile P2)

A noisy empty scan (many earlier detector errors) could exhaust the shared
30-event per-run cap before the terminal no_tools_found summary runs, dropping
exactly the diagnostic you most need on a likely missed-detection. Add a
`priority` flag to report_to_sentry that bypasses the COUNT cap only — it still
honors the circuit breaker (a dead transport can't be helped) and dedup (no
spam) — and pass priority=True for the no_tools_found event.

Co-Authored-By: Claude Opus 4.8 (1M context) &lt;noreply@anthropic.com&gt;

* Let priority Sentry events bypass the circuit breaker too (Greptile P1)

The priority flag bypassed the count cap but the circuit breaker — tripped by 3
consecutive, possibly transient, send failures — could still skip the terminal
no_tools_found event without even an attempt. Allow a priority event ONE bounded
attempt past the breaker (at most one ~4s curl at end of run) so a transient
mid-scan outage doesn't drop the terminal diagnostic. Dedup is still honored, so
it can never spam.

Co-Authored-By: Claude Opus 4.8 (1M context) &lt;noreply@anthropic.com&gt;

---------

Co-authored-by: audit &lt;audit@local&gt;
Co-authored-by: Claude Opus 4.8 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/scripts/coding_discovery_tools/ai_tools_discovery.py b/scripts/coding_discovery_tools/ai_tools_discovery.py
@@ -2475,6 +2475,10 @@ def _on_term_signal(signum, _frame) -> None:
             else:
                 all_users = []
 
+            # Pre-fallback count is the key no-tools discriminator: 0 here means the
+            # macOS/AD enumeration missed every account (the fallback below masks 0->1).
+            homes_enumerated = len(all_users)
+
             # If no users found, fall back to current user
             if not all_users:
                 all_users = [get_user_info()]
@@ -2875,6 +2879,32 @@ def _on_term_signal(signum, _frame) -> None:
         except Exception as metrics_err:
             logger.debug(f"Building/sending discovery metrics failed: {metrics_err}")
 
+        # No-tools telemetry: a zero-tool scan is otherwise silent — we cannot tell
+        # an enumeration miss from a timing race from a genuinely empty device.
+        # Emit ONE enriched warning with the discriminators. Fire-and-forget:
+        # telemetry must never raise into / slow the scan.
+        if not tools:
+            try:
+                no_tools_ctx = {
+                    **sentry_ctx,
+                    "phase": "no_tools_found",
+                    "homes_enumerated": homes_enumerated,
+                    "users_scanned": len(all_users),
+                    "used_fallback_user": homes_enumerated == 0,
+                    "os": platform.system(),
+                    "duration_ms": round((time.monotonic() - t_start) * 1000),
+                }
+                if hasattr(os, "getuid"):
+                    no_tools_ctx["is_root"] = os.getuid() == 0
+                report_to_sentry(
+                    RuntimeError("Discovery found no tools"),
+                    context=no_tools_ctx,
+                    level="warning",
+                    priority=True,
+                )
+            except Exception:
+                pass
+
         # Send scan completed event AFTER all scanning
         logger.info("Sending scan completed event...")
         success, _ = send_scan_event(
diff --git a/scripts/coding_discovery_tools/utils.py b/scripts/coding_discovery_tools/utils.py
@@ -1611,13 +1611,20 @@ def report_to_sentry(
     exception: Exception,
     context: Optional[Dict] = None,
     level: str = "error",
+    priority: bool = False,
 ) -> None:
     """Send an event to Sentry using the raw HTTP store endpoint.
 
     Args:
         exception: The exception to report.
         context: Extra tags/context (e.g. phase, tool_name, http_code).
         level: Sentry level -- "error" for crashes, "warning" for HTTP send failures.
+        priority: Best-effort guarantee a terminal once-per-run diagnostic
+            (e.g. the no_tools_found summary) is delivered. Bypasses both the
+            per-run event cap AND the circuit breaker so earlier transient
+            per-tool send failures can't silently skip it -- it still gets ONE
+            attempt at the end of the run (bounded: at most one ~4s curl). Dedup
+            is still honored (no spam). Reserve for a single terminal event/run.
     """
     try:
         dsn = _parse_sentry_dsn(_SENTRY_DSN)
@@ -1630,11 +1637,17 @@ def report_to_sentry(
         global _sentry_event_count, _sentry_consecutive_fails, _sentry_dead_this_run
         # Circuit breaker: once the transport looks dead, stop calling Sentry for the
         # rest of the run so a blocked endpoint can't add its timeout to every failure.
-        if _sentry_dead_this_run:
+        # priority events bypass it for ONE bounded attempt so a transient mid-scan
+        # outage doesn't silently drop the terminal diagnostic.
+        if _sentry_dead_this_run and not priority:
             return
         # Collapse duplicate events and hard-cap the synchronous curls per run.
+        # priority events skip the count cap + breaker (but never dedup) so a
+        # terminal once-per-run diagnostic isn't starved by earlier per-tool errors.
         signature = (type(exception).__name__, ctx.get("phase"), ctx.get("tool_name"))
-        if signature in _sentry_sent_signatures or _sentry_event_count >= _SENTRY_MAX_EVENTS_PER_RUN:
+        if signature in _sentry_sent_signatures:
+            return
+        if not priority and _sentry_event_count >= _SENTRY_MAX_EVENTS_PER_RUN:
             return
         _sentry_sent_signatures.add(signature)
         _sentry_event_count += 1
diff --git a/tests/test_discovery_flow.py b/tests/test_discovery_flow.py
@@ -1351,5 +1351,93 @@ def test_extract_failure_reports_warning_and_returns_empty(self):
         self.assertEqual(context.get("phase"), "extract")
 
 
+class TestNoToolsSentryEvent(unittest.TestCase):
+    """main() emits exactly one enriched 'no_tools_found' warning on a zero-tool
+    scan (the only signal that distinguishes an enumeration miss from a genuinely
+    empty device) and stays silent when any tool is detected."""
+
+    @staticmethod
+    def _context_of(call):
+        # report_to_sentry is invoked two ways in the codebase: context as the
+        # keyword `context=` (the no-tools path) and context positionally as the
+        # 2nd arg (extraction/process paths). Read whichever is present.
+        args, kwargs = call
+        if "context" in kwargs:
+            return kwargs["context"]
+        return args[1] if len(args) > 1 else {}
+
+    def _run_main(self, detect_return, mock_sentry):
+        import scripts.coding_discovery_tools.ai_tools_discovery as adm
+
+        argv = ["ai_tools_discovery.py", "--api-key", "k", "--domain", "http://127.0.0.1:1"]
+        detector_inst = Mock()
+        detector_inst.get_device_id.return_value = "dev"
+        detector_inst.detect_all_tools.return_value = detect_return
+        with patch.object(adm.platform, "system", return_value="Linux"), \
+             patch.object(adm.discovery_cache, "acquire_lock", return_value="acquired"), \
+             patch.object(adm.discovery_cache, "heartbeat_start", Mock(return_value=threading.Event())), \
+             patch.object(adm.discovery_cache, "release_lock", Mock()), \
+             patch.object(adm, "AIToolsDetector", return_value=detector_inst), \
+             patch.object(adm, "report_to_sentry", mock_sentry), \
+             patch.object(adm, "send_scan_event", Mock(return_value=(True, None))), \
+             patch.object(adm, "send_discovery_metrics", Mock()), \
+             patch.object(adm, "load_pending_reports", return_value=[]), \
+             patch.object(adm, "save_failed_reports", Mock()), \
+             patch.object(adm, "get_all_users_linux", return_value=[]), \
+             patch.object(adm, "get_user_info", return_value="runner"), \
+             patch.object(utils_mod, "_SENTRY_DSN", ""), \
+             patch.object(sys, "argv", argv):
+            detector_inst._set_canonical_vscode_copilot = Mock()
+            try:
+                adm.main()
+            except SystemExit:
+                pass
+
+    @unittest.skipUnless(os.name == "posix", "POSIX signal handling")
+    def test_fires_on_zero_tool_scan(self):
+        mock_sentry = Mock()
+        self._run_main([], mock_sentry)
+
+        no_tools_calls = [
+            c for c in mock_sentry.call_args_list
+            if self._context_of(c).get("phase") == "no_tools_found"
+        ]
+        self.assertEqual(len(no_tools_calls), 1, "expected exactly one no_tools_found event")
+
+        call = no_tools_calls[0]
+        args, kwargs = call
+        # Exception is positional, RuntimeError with the fixed message.
+        self.assertIsInstance(args[0], RuntimeError)
+        self.assertEqual(str(args[0]), "Discovery found no tools")
+        self.assertEqual(kwargs.get("level"), "warning")
+
+        ctx = self._context_of(call)
+        # get_all_users_linux -> [] means enumeration missed every account, so the
+        # current-user fallback supplies the single scanned home.
+        self.assertEqual(ctx.get("homes_enumerated"), 0)
+        self.assertEqual(ctx.get("users_scanned"), 1)
+        self.assertIs(ctx.get("used_fallback_user"), True)
+        self.assertIn("device_id", ctx)
+        self.assertIn("run_id", ctx)
+        self.assertIn("duration_ms", ctx)
+        # PII guard: no list-valued context (e.g. the raw user list) may leak.
+        for key, value in ctx.items():
+            self.assertNotIsInstance(value, list, f"context key {key!r} is a list")
+
+    @unittest.skipUnless(os.name == "posix", "POSIX signal handling")
+    def test_does_not_fire_when_tools_found(self):
+        mock_sentry = Mock()
+        self._run_main(
+            [{"name": "Claude Code", "version": "1.0", "install_path": "/home/runner/.claude"}],
+            mock_sentry,
+        )
+
+        no_tools_calls = [
+            c for c in mock_sentry.call_args_list
+            if self._context_of(c).get("phase") == "no_tools_found"
+        ]
+        self.assertEqual(no_tools_calls, [], "no_tools_found must not fire when a tool is detected")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_send_and_persist.py b/tests/test_send_and_persist.py
@@ -15,7 +15,7 @@
 from datetime import datetime, timezone, timedelta
 from http.server import HTTPServer, BaseHTTPRequestHandler
 from pathlib import Path
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 
 import scripts.coding_discovery_tools.utils as utils_mod
 from scripts.coding_discovery_tools.utils import (
@@ -491,5 +491,72 @@ def test_scan_failed_device_level_error(self, _sleep):
         self.assertEqual(payload["scan_error"]["error_type"], "RuntimeError")
 
 
+class TestSentryPriorityBypassesCap(unittest.TestCase):
+    """A priority=True event (the terminal no_tools_found summary) bypasses the
+    per-run event cap, but still respects the circuit breaker and dedup."""
+
+    def setUp(self):
+        self._reset_budget()
+
+    def tearDown(self):
+        self._reset_budget()
+
+    @staticmethod
+    def _reset_budget():
+        utils_mod._sentry_event_count = 0
+        utils_mod._sentry_sent_signatures = set()
+        utils_mod._sentry_consecutive_fails = 0
+        utils_mod._sentry_dead_this_run = False
+
+    @patch.object(utils_mod, "subprocess")
+    @patch.object(
+        utils_mod, "_parse_sentry_dsn",
+        return_value={"key": "k", "store_url": "http://sentry.invalid/store/"},
+    )
+    def test_priority_event_bypasses_count_cap(self, _dsn, mock_subprocess):
+        mock_subprocess.run.return_value = Mock(returncode=0, stdout="200", stderr="")
+        utils_mod._sentry_event_count = utils_mod._SENTRY_MAX_EVENTS_PER_RUN
+
+        # Non-priority event with a fresh signature is dropped: cap reached.
+        utils_mod.report_to_sentry(
+            RuntimeError("capped"), {"phase": "detect", "tool_name": "X"}
+        )
+        self.assertEqual(mock_subprocess.run.call_count, 0)
+
+        # The terminal priority event still sends despite the exhausted budget.
+        utils_mod.report_to_sentry(
+            RuntimeError("Discovery found no tools"),
+            {"phase": "no_tools_found"},
+            level="warning",
+            priority=True,
+        )
+        self.assertEqual(mock_subprocess.run.call_count, 1)
+
+    @patch.object(utils_mod, "subprocess")
+    @patch.object(
+        utils_mod, "_parse_sentry_dsn",
+        return_value={"key": "k", "store_url": "http://sentry.invalid/store/"},
+    )
+    def test_priority_bypasses_breaker_but_respects_dedup(self, _dsn, mock_subprocess):
+        mock_subprocess.run.return_value = Mock(returncode=0, stdout="200", stderr="")
+
+        # Breaker open from earlier (possibly transient) failures => a priority event
+        # STILL gets its one bounded attempt, so the terminal diagnostic isn't lost.
+        utils_mod._sentry_dead_this_run = True
+        utils_mod.report_to_sentry(
+            RuntimeError("Discovery found no tools"),
+            {"phase": "no_tools_found"}, priority=True,
+        )
+        self.assertEqual(mock_subprocess.run.call_count, 1)
+
+        # Dedup is still honored even with priority: the same signature (added by the
+        # send above) is not re-sent, so a priority event can never spam.
+        utils_mod.report_to_sentry(
+            RuntimeError("Discovery found no tools"),
+            {"phase": "no_tools_found"}, priority=True,
+        )
+        self.assertEqual(mock_subprocess.run.call_count, 1)
+
+
 if __name__ == "__main__":
     unittest.main()