fix(scan): stop attributing host-wide findings to per-target scans

jonathansantilli · jonathansantilli · commit b7871a4cac62 · 2026-04-22T13:41:29.000+01:00
A targeted `scan <target>` currently surfaces two classes of finding whose `file_path` is outside the scan target. When the Mobb Agent Security dashboard groups scans by target, those host-wide findings make every skill look suspicious even though only one is. This fixes both leaks. Layer 2 — hidden-unicode (and every other rule-file detector): when the scan target is itself inside the user's home directory (for example a single skill at `~/.codex/skills/<name>`), user-scope wildcard patterns like `~/.agents/skills/*/SKILL.md` were walking the whole home tree and picking up sibling skills. Add a `shouldKeepUserScopeCandidate` guard that drops user-scope matches outside the scan target in exactly this case; project-root scans outside the home directory keep the existing host-wide context behavior unchanged. Layer 3 — `layer3OutcomesToFindings`: a successful outcome with no `metadata.findings[]` / `metadata.tools[]` was emitting a LOW `layer3-network_error` PARSE_ERROR finding whose `file_path` was the remote MCP URL (e.g. `https://mcp.linear.app/mcp`). The default resource executor deliberately makes no outbound calls, so this "schema mismatch" was the norm, not an anomaly, and the resulting finding polluted every per-target scan report with host-level noise. Treat all resource kinds the same way registry resources were already handled: silently skip when no actionable metadata is present. Tests added: - `tests/layer2/cross-scan-attribution.test.ts` — sibling skill inside the same home is no longer attributed to the scanned skill, but is reported when the scan target is their parent; project scans outside the home still see user-scope files. - `tests/layer3/network-error-suppression.test.ts` — HTTP/SSE schema mismatches produce no findings while timeout / auth_failure / command_error / skipped_without_consent still do. `tests/layer3/layer3-integration.test.ts` is updated to match: the case that asserted the old schema-mismatch finding now asserts its absence.
diff --git a/src/pipeline.ts b/src/pipeline.ts
@@ -314,12 +314,6 @@ function layer3ErrorFinding(
   });
 }
 
-function isRegistryMetadataResource(resourceId: string): boolean {
-  return (
-    resourceId.startsWith("npm:") || resourceId.startsWith("pypi:") || resourceId.startsWith("git:")
-  );
-}
-
 export function layer3OutcomesToFindings(
   outcomes: DeepScanOutcome[],
   options: { unicodeAnalysis?: boolean } = {},
@@ -353,17 +347,17 @@ export function layer3OutcomesToFindings(
     const derived = deriveLayer3ToolFindings(outcome.resourceId, outcome.result.metadata, options);
     const combined = [...parsed, ...derived];
 
+    // If a Layer 3 resource was fetched successfully but carries no
+    // actionable metadata (no `findings[]`, no `tools[]`), that is not an
+    // issue with the scan target itself — it usually means the default
+    // no-outbound-call resource executor recorded only a URL stub, or that
+    // a host-configured MCP endpoint simply returned an unrecognised
+    // payload. Previously we emitted a LOW `layer3-network_error`
+    // "schema mismatch" finding whose `file_path` was the remote URL,
+    // which leaked host-level noise into every per-target scan report.
+    // Fetch-level anomalies that are unrelated to the scan target are now
+    // dropped silently for all resource kinds.
     if (combined.length === 0) {
-      if (isRegistryMetadataResource(outcome.resourceId)) {
-        continue;
-      }
-      findings.push(
-        layer3ErrorFinding(
-          outcome.resourceId,
-          "network_error",
-          "Deep scan response schema mismatch: expected metadata.findings[] or metadata.tools[]",
-        ),
-      );
       continue;
     }
     findings.push(...combined);
diff --git a/src/scan.ts b/src/scan.ts
@@ -232,6 +232,53 @@ function isRegularFile(path: string): boolean {
   }
 }
 
+/** True when `candidatePath` resolves at or below `root`. */
+function isPathInside(root: string, candidatePath: string): boolean {
+  const resolvedCandidate = resolve(candidatePath);
+  const resolvedRoot = resolve(root);
+  if (resolvedCandidate === resolvedRoot) {
+    return true;
+  }
+  const rel = relative(resolvedRoot, resolvedCandidate);
+  if (rel === "" || rel === ".") {
+    return true;
+  }
+  if (rel.startsWith("..")) {
+    return false;
+  }
+  // On Windows, relative() may return an absolute path across drives.
+  if (rel.includes(":")) {
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Decide whether a user-scope candidate at `candidatePath` should be attached
+ * to a scan of `scanTarget` rooted at `homeDir`.
+ *
+ * User-scope patterns (e.g. `~/.agents/skills/&ast;/SKILL.md`) walk the whole
+ * home directory, so they can match files belonging to completely unrelated
+ * skills or agents. When the scan target is itself a specific location
+ * **inside** the user's home — e.g. scanning a single skill directory — any
+ * user-scope match outside that scan target belongs to a different scan and
+ * must not be attributed here.
+ *
+ * When the scan target lives outside the home directory (for example a
+ * project root in a workspace), user-scope matches are accepted as legitimate
+ * host-wide context for that scan.
+ */
+function shouldKeepUserScopeCandidate(
+  scanTarget: string,
+  homeDir: string,
+  candidatePath: string,
+): boolean {
+  if (isPathInside(homeDir, scanTarget)) {
+    return isPathInside(scanTarget, candidatePath);
+  }
+  return true;
+}
+
 function toUserReportPath(pattern: string): string {
   const normalized = normalizeUserScopePattern(pattern);
   return `~/${normalized}`;
@@ -411,6 +458,14 @@ function collectSelectedCandidates(
       const userPattern = normalizeUserScopePattern(candidate.pattern);
       if (userPattern.includes("*")) {
         for (const match of collectUserScopeWildcardMatches(options.homeDir, userPattern)) {
+          // A scan whose target itself lives under the user's home directory
+          // (e.g. a single skill at `~/.codex/skills/foo`) must only report
+          // findings about files inside that target. User-scope wildcards
+          // walk the whole home tree, so they can match sibling skills or
+          // other agents that belong to different scans; drop those here.
+          if (!shouldKeepUserScopeCandidate(absoluteTarget, options.homeDir, match.absolutePath)) {
+            continue;
+          }
           const reportPath = toUserReportPath(match.relativePath);
           if (!matchesCollectionKinds(reportPath, options.collectKinds)) {
             continue;
@@ -430,6 +485,9 @@ function collectSelectedCandidates(
       if (!existsSync(absolutePath) || !isRegularFile(absolutePath)) {
         continue;
       }
+      if (!shouldKeepUserScopeCandidate(absoluteTarget, options.homeDir, absolutePath)) {
+        continue;
+      }
       const reportPath = toUserReportPath(userPattern);
       if (!matchesCollectionKinds(reportPath, options.collectKinds)) {
         continue;
diff --git a/tests/layer2/cross-scan-attribution.test.ts b/tests/layer2/cross-scan-attribution.test.ts
@@ -0,0 +1,158 @@
+import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { describe, expect, it } from "vitest";
+import type { CodeGateConfig } from "../../src/config";
+import { runScanEngine } from "../../src/scan";
+
+/**
+ * These tests pin down the fix for the cross-scan finding-attribution bug.
+ *
+ * User-scope patterns (e.g. `~/.agents/skills/&ast;/SKILL.md`) walk the user's
+ * entire home directory. When the scan target itself is a specific path
+ * inside the home directory — for example a single skill at
+ * `~/.codex/skills/<name>` — wildcard matches for sibling skills belong to
+ * completely different scans and must not be attributed to the current scan.
+ *
+ * The scenarios below simulate scanning one skill while a sibling skill that
+ * contains hidden Unicode also exists on the host, and assert that only the
+ * in-target finding is reported.
+ */
+
+const BASE_CONFIG: CodeGateConfig = {
+  severity_threshold: "high",
+  auto_proceed_below_threshold: true,
+  output_format: "terminal",
+  scan_state_path: "/tmp/codegate-cross-scan-state.json",
+  tui: { enabled: false, colour_scheme: "default", compact_mode: false },
+  tool_discovery: { preferred_agent: "claude", agent_paths: {}, skip_tools: [] },
+  trusted_directories: [],
+  blocked_commands: ["bash", "sh", "curl", "wget", "nc", "python", "node"],
+  known_safe_mcp_servers: [],
+  known_safe_formatters: [],
+  known_safe_lsp_servers: [],
+  known_safe_hooks: [],
+  unicode_analysis: true,
+  check_ide_settings: true,
+  owasp_mapping: true,
+  trusted_api_domains: [],
+  strict_collection: false,
+  scan_collection_modes: ["default"],
+  persona: "regular",
+  runtime_mode: "offline",
+  workflow_audits: { enabled: false },
+  suppress_findings: [],
+  scan_user_scope: true,
+};
+
+describe("cross-scan attribution — Layer 2 hidden-unicode rule", () => {
+  it("does not flag sibling user-scope skills when the scan target is inside the home directory", async () => {
+    const home = mkdtempSync(join(tmpdir(), "codegate-cross-scan-home-"));
+
+    // Target skill: the scan target itself lives under the fake home dir,
+    // exactly like `~/.codex/skills/<name>` in the bug report.
+    mkdirSync(join(home, ".agents", "skills", "foo"), { recursive: true });
+    writeFileSync(
+      join(home, ".agents", "skills", "foo", "SKILL.md"),
+      "Clean skill body.\n",
+      "utf8",
+    );
+
+    // Sibling skill with hidden Unicode under the same user-scope wildcard.
+    // Under the old behavior this file would surface as a cross-scan finding
+    // attributed to the `foo` scan.
+    mkdirSync(join(home, ".agents", "skills", "bar"), { recursive: true });
+    writeFileSync(
+      join(home, ".agents", "skills", "bar", "SKILL.md"),
+      "Sibling skill​ with a hidden zero-width space.\n",
+      "utf8",
+    );
+
+    const scanTarget = join(home, ".agents", "skills", "foo");
+
+    const report = await runScanEngine({
+      version: "0.1.0",
+      scanTarget,
+      config: BASE_CONFIG,
+      homeDir: home,
+    });
+
+    const hiddenUnicodeFindings = report.findings.filter(
+      (finding) => finding.rule_id === "rule-file-hidden-unicode",
+    );
+
+    // No finding should reference the sibling skill.
+    expect(hiddenUnicodeFindings.some((finding) => finding.file_path.includes("skills/bar"))).toBe(
+      false,
+    );
+
+    // No finding in the whole report should reference the sibling skill.
+    expect(report.findings.some((finding) => finding.file_path.includes("skills/bar"))).toBe(false);
+  });
+
+  it("still flags hidden Unicode in a sibling skill when the scan target *is* the parent of both", async () => {
+    // Sanity check: restricting sibling attribution must not hide in-target
+    // findings. When the user scans the parent directory, both skills are
+    // inside the scan target and both must be reported.
+    const home = mkdtempSync(join(tmpdir(), "codegate-cross-scan-parent-home-"));
+    mkdirSync(join(home, ".agents", "skills", "foo"), { recursive: true });
+    writeFileSync(
+      join(home, ".agents", "skills", "foo", "SKILL.md"),
+      "Clean skill body.\n",
+      "utf8",
+    );
+    mkdirSync(join(home, ".agents", "skills", "bar"), { recursive: true });
+    writeFileSync(
+      join(home, ".agents", "skills", "bar", "SKILL.md"),
+      "Sibling skill​ with a hidden zero-width space.\n",
+      "utf8",
+    );
+
+    const scanTarget = join(home, ".agents", "skills");
+
+    const report = await runScanEngine({
+      version: "0.1.0",
+      scanTarget,
+      config: BASE_CONFIG,
+      homeDir: home,
+    });
+
+    expect(
+      report.findings.some(
+        (finding) =>
+          finding.rule_id === "rule-file-hidden-unicode" &&
+          finding.file_path.includes("bar/SKILL.md"),
+      ),
+    ).toBe(true);
+  });
+
+  it("preserves user-scope attribution when the scan target is not inside the home directory", async () => {
+    // When scanning an arbitrary project root (outside the user's home),
+    // user-scope files remain legitimate host-wide context and should still
+    // be included as before.
+    const home = mkdtempSync(join(tmpdir(), "codegate-cross-scan-external-home-"));
+    const projectRoot = mkdtempSync(join(tmpdir(), "codegate-cross-scan-project-"));
+
+    mkdirSync(join(home, ".agents", "skills", "bar"), { recursive: true });
+    writeFileSync(
+      join(home, ".agents", "skills", "bar", "SKILL.md"),
+      "Sibling skill​ with a hidden zero-width space.\n",
+      "utf8",
+    );
+
+    const report = await runScanEngine({
+      version: "0.1.0",
+      scanTarget: projectRoot,
+      config: BASE_CONFIG,
+      homeDir: home,
+    });
+
+    expect(
+      report.findings.some(
+        (finding) =>
+          finding.rule_id === "rule-file-hidden-unicode" &&
+          finding.file_path === "~/.agents/skills/bar/SKILL.md",
+      ),
+    ).toBe(true);
+  });
+});
diff --git a/tests/layer3/layer3-integration.test.ts b/tests/layer3/layer3-integration.test.ts
@@ -8,7 +8,7 @@ import { createEmptyReport } from "../../src/types/report";
 import { planRemediation } from "../../src/layer4-remediation/remediator";
 
 describe("task 28 layer3 integration", () => {
-  it("emits parse-style findings for consent refusal, timeout, and schema mismatch", () => {
+  it("emits parse-style findings for consent refusal and timeout, but not for schema mismatch", () => {
     const outcomes: DeepScanOutcome[] = [
       {
         resourceId: "npm:@org/a",
@@ -39,11 +39,15 @@ describe("task 28 layer3 integration", () => {
       },
     ];
 
+    // Schema-mismatch outcomes are no longer surfaced as findings: they
+    // describe a host-configured endpoint's behavior, not an issue with
+    // the scan target, and previously caused `layer3-network_error`
+    // findings to appear on every per-target scan report.
     const findings = layer3OutcomesToFindings(outcomes);
-    expect(findings).toHaveLength(3);
+    expect(findings).toHaveLength(2);
     expect(findings[0]?.finding_id).toContain("skipped_without_consent");
     expect(findings[1]?.severity).toBe("MEDIUM");
-    expect(findings[2]?.description).toContain("schema mismatch");
+    expect(findings.some((finding) => finding.rule_id === "layer3-network_error")).toBe(false);
   });
 
   it("merges valid layer3 findings into report summary and supports source-config remediation", () => {
diff --git a/tests/layer3/network-error-suppression.test.ts b/tests/layer3/network-error-suppression.test.ts