// Source: codegate/src/layer3-dynamic/url-validation.ts (jonathansantilli/codegate @ 846398905d0b6a9280d5470d155ef7db357a8e99)

/**
 * URL validation and normalisation helpers for Layer 3 remote resource handling.
 *
 * These helpers guarantee that remote resources (HTTP/SSE MCP endpoints,
 * skill-referenced URLs) use a safe, canonical form before they are fed into
 * finding `rule_id` / `file_path` fields or into the fetcher.
 *
 * Historically, L3 resource IDs were composed as `${kind}:${url}` which, for
 * http/sse kinds, produced malformed values like `http:https://mcp.linear.app/mcp`
 * (the kind collides with the URL's own scheme). `buildResourceId` avoids that
 * double-scheme shape by reusing the URL itself as the id for http/sse kinds.
 */
/** URL schemes accepted for remote resources; reported in the `scheme` field of a successful `normalizeRemoteUrl` result. */
export type RemoteScheme = "http" | "https";

/** Successful outcome of `normalizeRemoteUrl`. */
export interface NormalizeRemoteUrlResult {
  /** Discriminant: always `true` on success. */
  ok: true;
  /** Canonical URL, as serialised by the `URL` parser after trailing-slash normalisation. */
  url: string;
  /** Scheme of the canonical URL, lower-cased and without the trailing colon. */
  scheme: RemoteScheme;
}

/** Failed outcome of `normalizeRemoteUrl`. */
export interface NormalizeRemoteUrlError {
  /** Discriminant: always `false` on failure. */
  ok: false;
  /**
   * Machine-readable cause of the rejection:
   * - `empty`: the input was blank (or not a string at all).
   * - `unsupported_scheme`: the URL uses a scheme other than http/https.
   * - `missing_host`: the URL has no host component.
   * - `missing_scheme`: the input carries no usable scheme prefix.
   * - `invalid_url`: the input could not be parsed as a URL.
   */
  reason: "empty" | "unsupported_scheme" | "missing_host" | "missing_scheme" | "invalid_url";
}

/**
 * Validate and canonicalise a remote URL.
 *
 * Only absolute `http://` / `https://` URLs with a host are accepted. On
 * success the URL is re-serialised by the `URL` parser and its path is
 * normalised: a bare host keeps a single `/`, while longer paths lose any
 * trailing slashes.
 *
 * Failure reasons:
 * - `empty`: the input is not a string or is blank after trimming.
 * - `missing_host`: an http(s) scheme with nothing that could be a host after
 *   it (`http:`, `https://`, `http://?q=1`).
 * - `unsupported_scheme`: a `<scheme>://` prefix other than http/https
 *   (`ftp://host`, `file:///etc/passwd`).
 * - `missing_scheme`: no `<scheme>://` prefix at all (`example.com/path`).
 * - `invalid_url`: the `URL` parser rejected the input.
 *
 * @param input Raw URL text; surrounding whitespace is ignored.
 * @returns `{ ok: true, url, scheme }` on success, otherwise `{ ok: false, reason }`.
 */
export function normalizeRemoteUrl(
  input: string,
): NormalizeRemoteUrlResult | NormalizeRemoteUrlError {
  // Runtime guard: the value may come from untyped data despite the signature.
  if (typeof input !== "string") {
    return { ok: false, reason: "empty" };
  }

  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return { ok: false, reason: "empty" };
  }

  // An http(s) scheme followed only by slashes, or directly by a query or
  // fragment, has no host. This must be decided before parsing: the URL
  // parser throws on an empty host for http(s), so the failure would
  // otherwise be indistinguishable from any other malformed input.
  if (/^https?:\/*(?:[?#]|$)/iu.test(trimmed)) {
    return { ok: false, reason: "missing_host" };
  }

  // Must start with http:// or https:// (case-insensitive). Tell apart
  // "some other scheme" (ftp://, file://, ...) from "no scheme at all".
  // Requiring `://` keeps `host:port/path` from being read as a scheme.
  if (!/^https?:\/\//iu.test(trimmed)) {
    const hasOtherScheme = /^[a-z][a-z\d+.-]*:\/\//iu.test(trimmed);
    return { ok: false, reason: hasOtherScheme ? "unsupported_scheme" : "missing_scheme" };
  }

  let parsed: URL;
  try {
    parsed = new URL(trimmed);
  } catch {
    return { ok: false, reason: "invalid_url" };
  }

  // Defensive re-checks on the parsed form; the pre-filters above should
  // already have rejected anything that would trip these.
  const protocol = parsed.protocol.toLowerCase();
  if (protocol !== "http:" && protocol !== "https:") {
    return { ok: false, reason: "unsupported_scheme" };
  }

  if (parsed.hostname.length === 0) {
    return { ok: false, reason: "missing_host" };
  }

  // Normalise trailing slashes: keep `/` for root paths, strip for others.
  if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
    parsed.pathname = parsed.pathname.replace(/\/+$/u, "");
  }

  return {
    ok: true,
    url: parsed.toString(),
    scheme: protocol === "https:" ? "https" : "http",
  };
}

/** Kinds of resource for which `buildResourceId` can build an id. */
export type DeepScanResourceKind = "npm" | "pypi" | "git" | "http" | "sse";

/**
 * Compose the canonical id of a resource, as used in finding fields
 * (`rule_id`, `file_path`) and in consent prompts.
 *
 * - `http` / `sse`: the locator is already a URL, so it is returned verbatim.
 *   Prefixing it would yield the malformed `http:https://...` shape.
 * - `npm` / `pypi` / `git`: the locator is not a URL, so the id is
 *   `<kind>:<locator>`. Other code (e.g. `isRegistryMetadataResource`) keys
 *   on that prefix.
 *
 * @param kind Resource kind.
 * @param locator URL for http/sse kinds, otherwise the kind-specific locator.
 * @returns The resource id string.
 */
export function buildResourceId(kind: DeepScanResourceKind, locator: string): string {
  switch (kind) {
    case "http":
    case "sse":
      // URL-shaped locators double as their own id.
      return locator;
    default:
      return `${kind}:${locator}`;
  }
}