Skip to content
63 changes: 59 additions & 4 deletions scripts/coding_discovery_tools/ai_tools_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def get_device_id(self) -> str:
"""
return self._device_id_extractor.extract_device_id()

def detect_all_tools(self, user_home: Optional[Path] = None) -> List[Dict]:
def detect_all_tools(self, user_home: Optional[Path] = None, failures: Optional[set] = None) -> List[Dict]:
"""
Detect all supported AI tools.

Expand Down Expand Up @@ -401,6 +401,9 @@ def detect_all_tools(self, user_home: Optional[Path] = None) -> List[Dict]:
except Exception as e:
logger.warning(f"Error detecting {detector.tool_name}: {e}")
report_to_sentry(e, {"phase": "detect", "tool_name": detector.tool_name}, level="warning")
# Detection errored: record the tool so the caller can keep it (presence unknown != uninstalled).
if failures is not None:
failures.add(detector.tool_name)

return tools

Expand Down Expand Up @@ -2802,6 +2805,11 @@ def _on_term_signal(signum, _frame) -> None:
# Track failed reports for persistence
failed_reports = []

# (home_user, tool_name) detected present this run; backend set-diffs it in "completed" to prune the rest.
scanned_manifest = set()
# Detection/extraction errors this run; if non-empty the scan is marked unclean so the backend skips pruning.
incomplete_reasons = []

# --- Drain pending reports from previous run ---
with time_step("drain_pending_queue", "queue"):
pending = load_pending_reports()
Expand Down Expand Up @@ -2896,7 +2904,20 @@ def _on_term_signal(signum, _frame) -> None:
user_home = Path.home()
logger.info(f" Detecting tools for user: {user} (home: {user_home})")
with time_step("detect_tools", "detect"):
user_tools = detector.detect_all_tools(user_home=user_home)
user_detect_failures = set()
user_tools = detector.detect_all_tools(
user_home=user_home, failures=user_detect_failures
)
# Record per-user presence at detection: a tool this user actually has stays in the
# manifest even if reading/uploading it later errors (a read failure isn't an
# uninstall), and only users who detected the tool get an entry (no phantom ownership).
for detected in user_tools:
scanned_manifest.add((user, detected.get('name', 'Unknown')))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Phantom ownership remains

This records the current outer-loop user for every tool returned by the detector, but some extension detectors can return tools from other home directories during root scans. For example, if Alice has Roo Code or Cline and Bob does not, the detector can still return Alice's extension while the loop is scanning Bob. That adds (Bob, tool_name) to the manifest, lets the later manifest guard pass, and can send Bob a phantom install with no per-user projects. The backend then treats Bob's row as still installed and cannot prune it. The manifest entry needs to come from the detected tool's actual owner, or these detector families need an ownership check before the pair is added.

# A detector ERRORED -> presence unknown. detector.tool_name is an umbrella label
# (e.g. "GitHub Copilot"), not the concrete row name ("GitHub Copilot (VS Code)"),
# so it can't safely target the manifest. Skip pruning this run instead.
if user_detect_failures:
incomplete_reasons.append(f"detector error for user {user}")

if user_tools:
logger.info(f" Found {len(user_tools)} tool(s) for {user}:")
Expand Down Expand Up @@ -2959,6 +2980,13 @@ def _on_term_signal(signum, _frame) -> None:
user_home = Path.home()

try:
# Only report a tool for users who actually detected it (the manifest is
# the per-user presence set). all_tools is deduped globally, so without this
# a user-scoped tool one user has would otherwise be reported for every
# enumerated user — a phantom install the backend could never prune.
if (user_name, tool_name) not in scanned_manifest:
continue

# Filter projects to only include this user's projects
with time_step("filter_projects", "process"):
tool_filtered = detector.filter_tool_projects_by_user(tool_with_projects, user_home)
Expand All @@ -2974,6 +3002,8 @@ def _on_term_signal(signum, _frame) -> None:
f"{tool_filtered.get('_config_path') or tool_filtered.get('install_path')!r} "
f"not owned by this user and no per-user data"
)
# Detected globally but not owned by this user -> drop the presence entry.
scanned_manifest.discard((user_name, tool_name))
continue

# Ownership gate (Augment surfaces): same ~/.augment-keyed
Expand All @@ -2986,6 +3016,8 @@ def _on_term_signal(signum, _frame) -> None:
f"{tool_filtered.get('_config_path') or tool_filtered.get('install_path')!r} "
f"not owned by this user and no per-user data"
)
# Detected globally but not owned by this user -> drop the presence entry.
scanned_manifest.discard((user_name, tool_name))
continue

# Detect subscription plan for Claude Code
Expand Down Expand Up @@ -3197,6 +3229,8 @@ def _on_term_signal(signum, _frame) -> None:

except Exception as e:
logger.error(f"Error processing tool {tool_name}: {e}", exc_info=True)
# Detected tools are already in the manifest from the detection phase, so a
# device-wide extraction failure here cannot drop a live tool (no re-add needed).
report_to_sentry(e, {**sentry_ctx, "phase": "process_tool", "tool_name": tool_name}, level="warning")
logger.info("")

Expand Down Expand Up @@ -3233,6 +3267,8 @@ def _on_term_signal(signum, _frame) -> None:
"os": platform.system(),
"tool_count": len(tools),
"user_count": len(all_users),
"manifest_size": len(scanned_manifest),
"scan_incomplete": bool(incomplete_reasons),
"python_version": f"{sys.version_info.major}.{sys.version_info.minor}",
"script_version": SCRIPT_VERSION,
},
Expand All @@ -3244,11 +3280,30 @@ def _on_term_signal(signum, _frame) -> None:
except Exception as metrics_err:
logger.debug(f"Building/sending discovery metrics failed: {metrics_err}")

# Send scan completed event AFTER all scanning
# Detection/extraction hit an error this run -> mark the scan unclean BEFORE the
# completed event so the backend's reconcile skips pruning (a missing tool may mean
# "couldn't read", not "uninstalled"). Sent first so scan_error is persisted before
# the completed event dispatches the reconcile.
if incomplete_reasons:
send_scan_event(
args.domain, args.api_key, device_id, run_id, "failed",
args.app_name,
scan_error={
"error_type": "ScanIncomplete",
"message": "; ".join(incomplete_reasons[:20]),
"timestamp": datetime.utcnow().isoformat() + "Z",
},
sentry_context=sentry_ctx, system_user=system_user,
)

# only the completed event carries manifest + covered users (backend prunes from them)
logger.info("Sending scan completed event...")
# manifest = (home_user, tool_name) detected present; backend set-diffs it to prune the rest.
manifest = [{"home_user": hu, "tool_name": tn} for hu, tn in sorted(scanned_manifest)]
success, _ = send_scan_event(
args.domain, args.api_key, device_id, run_id, "completed",
args.app_name, sentry_context=sentry_ctx, system_user=system_user
args.app_name, sentry_context=sentry_ctx, system_user=system_user,
manifest=manifest, covered_home_users=all_users,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incomplete scan still sends prune manifest

Medium Severity

When incomplete_reasons is non-empty, the client sends a failed event with ScanIncomplete and then always sends completed with manifest and covered_home_users. The failed send’s success is ignored, and the completed payload carries no incompleteness flag. If the failed event never reaches the backend, reconcile may run on a partial manifest and soft-delete tools that were skipped because of detector errors.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 83a19b3. Configure here.

)
if success:
logger.info("✓ Scan completed event sent successfully")
Expand Down
12 changes: 12 additions & 0 deletions scripts/coding_discovery_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,8 @@ def send_scan_event(
scan_error: Optional[Dict] = None,
sentry_context: Optional[Dict] = None,
system_user: Optional[str] = None,
manifest: Optional[List[Dict]] = None,
covered_home_users: Optional[List[str]] = None,
) -> Tuple[bool, bool]:
"""
Send scan lifecycle event to backend (in_progress, completed, failed).
Expand All @@ -733,6 +735,10 @@ def send_scan_event(
system_user: Optional real human user running the scan (or None). Used by
the backend to attribute empty machines. MUST be a real human or
None (see ``get_audit_user``), never a junk/service identity.
manifest: Optional [{"home_user", "tool_name"}] seen this run; sent only on
"completed" so the backend set-diffs it to prune the rest.
covered_home_users: Optional home users covered; sent only on "completed" to
bound the prune scope.

Returns:
Tuple of (success, retryable): success=True if sent, retryable=True if caller should queue
Expand All @@ -755,6 +761,12 @@ def send_scan_event(
if scan_error:
payload["scan_error"] = scan_error

if manifest is not None:
payload["manifest"] = manifest

if covered_home_users is not None:
payload["covered_home_users"] = covered_home_users

return send_report_to_backend(
backend_url,
api_key,
Expand Down
Loading
Loading