Skip to content

Commit 750c422

Browse files
fix(layer3): clean remote-resource URLs + configurable timeout & byte-size guards (#52)
* fix(layer3): produce clean resource IDs for remote HTTP/SSE URLs Layer 3 resource discovery composed IDs as `${kind}:${url}`, which for http/sse kinds collided with the URL's own scheme and produced malformed values like `http:https://mcp.linear.app/mcp`. These IDs leaked into `rule_id` and `file_path` on Layer 3 findings. Introduce `url-validation.ts` with `normalizeRemoteUrl` (reject non http/https schemes, missing hosts, canonicalise trailing slashes) and `buildResourceId` (use the URL itself for http/sse kinds, preserve the `<kind>:<locator>` shape for npm/pypi/git). Wire both into the scan resource collector so IDs are clean and URLs validated before becoming findings. * fix(layer3): add configurable timeout and byte-size guard for remote fetches A slow or malicious remote host could previously hang a scan or flood it with gigabytes of data through a Layer 3 resource probe. Harden the remote fetcher with two new limits: * `layer3_remote_fetch_timeout_ms` (default 5000) — per-attempt timeout enforced via AbortController. Overridable by `CODEGATE_LAYER3_REMOTE_FETCH_TIMEOUT_MS`. * `layer3_remote_fetch_max_bytes` (default 1_048_576) — rejects declared Content-Length above the cap immediately, and aborts the streaming read once bytes exceed it (so servers that lie about / omit the header cannot bypass the guard). Overridable by `CODEGATE_LAYER3_REMOTE_FETCH_MAX_BYTES`. Both limits are exposed only via config file and env var — no CLI flag. The fetcher exports `resourceFetcherOptionsFromConfig` as a small helper for callers plumbing the limits from `CodeGateConfig`.
1 parent 360cbcd commit 750c422

17 files changed

Lines changed: 655 additions & 55 deletions

src/config.ts

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,22 @@ export interface CodeGateConfig {
7676
workflow_audits?: WorkflowAuditConfig;
7777
suppress_findings: string[];
7878
suppression_rules?: SuppressionRule[];
79+
/**
80+
* Timeout (in milliseconds) applied to Layer 3 remote resource fetches
81+
* (npm/PyPI registry lookups, git ls-remote, and any http/sse MCP probes).
82+
* Kept deliberately low so a slow or deliberately stalling host cannot
83+
* hang a scan. Overridable via `CODEGATE_LAYER3_REMOTE_FETCH_TIMEOUT_MS`.
84+
*/
85+
layer3_remote_fetch_timeout_ms: number;
86+
/**
87+
* Maximum response size (in bytes) accepted from a Layer 3 remote fetch.
88+
* A declared `Content-Length` above this value is rejected immediately,
89+
* and the streaming reader aborts once the running byte count exceeds
90+
* this limit (defends against servers that lie about or omit
91+
* `Content-Length`). Overridable via
92+
* `CODEGATE_LAYER3_REMOTE_FETCH_MAX_BYTES`.
93+
*/
94+
layer3_remote_fetch_max_bytes: number;
7995
}
8096

8197
interface PartialTuiConfig {
@@ -122,6 +138,8 @@ interface PartialCodeGateConfig {
122138
};
123139
suppress_findings?: string[];
124140
suppression_rules?: SuppressionRule[];
141+
layer3_remote_fetch_timeout_ms?: number;
142+
layer3_remote_fetch_max_bytes?: number;
125143
}
126144

127145
export interface CliConfigOverrides {
@@ -175,8 +193,36 @@ export const DEFAULT_CONFIG: CodeGateConfig = {
175193
workflow_audits: { enabled: false },
176194
suppress_findings: [],
177195
suppression_rules: [],
196+
layer3_remote_fetch_timeout_ms: 5000,
197+
layer3_remote_fetch_max_bytes: 1_048_576,
178198
};
179199

200+
/** Env var name that overrides `layer3_remote_fetch_timeout_ms`. */
201+
export const LAYER3_REMOTE_FETCH_TIMEOUT_ENV = "CODEGATE_LAYER3_REMOTE_FETCH_TIMEOUT_MS";
202+
/** Env var name that overrides `layer3_remote_fetch_max_bytes`. */
203+
export const LAYER3_REMOTE_FETCH_MAX_BYTES_ENV = "CODEGATE_LAYER3_REMOTE_FETCH_MAX_BYTES";
204+
205+
function normalizePositiveInteger(value: unknown): number | undefined {
206+
if (typeof value === "number" && Number.isFinite(value) && value > 0) {
207+
return Math.floor(value);
208+
}
209+
if (typeof value === "string" && value.trim().length > 0) {
210+
const parsed = Number(value.trim());
211+
if (Number.isFinite(parsed) && parsed > 0) {
212+
return Math.floor(parsed);
213+
}
214+
}
215+
return undefined;
216+
}
217+
218+
function readEnvOverride(name: string): number | undefined {
219+
const raw = process.env[name];
220+
if (raw === undefined) {
221+
return undefined;
222+
}
223+
return normalizePositiveInteger(raw);
224+
}
225+
180226
interface PartialRulePolicyConfig {
181227
disable?: boolean;
182228
ignore?: string[];
@@ -627,6 +673,20 @@ export function resolveEffectiveConfig(options: ResolveConfigOptions): CodeGateC
627673
...(globalConfig.suppression_rules ?? []),
628674
...(projectConfig.suppression_rules ?? []),
629675
],
676+
layer3_remote_fetch_timeout_ms:
677+
pickFirst(
678+
readEnvOverride(LAYER3_REMOTE_FETCH_TIMEOUT_ENV),
679+
normalizePositiveInteger(projectConfig.layer3_remote_fetch_timeout_ms),
680+
normalizePositiveInteger(globalConfig.layer3_remote_fetch_timeout_ms),
681+
DEFAULT_CONFIG.layer3_remote_fetch_timeout_ms,
682+
) ?? DEFAULT_CONFIG.layer3_remote_fetch_timeout_ms,
683+
layer3_remote_fetch_max_bytes:
684+
pickFirst(
685+
readEnvOverride(LAYER3_REMOTE_FETCH_MAX_BYTES_ENV),
686+
normalizePositiveInteger(projectConfig.layer3_remote_fetch_max_bytes),
687+
normalizePositiveInteger(globalConfig.layer3_remote_fetch_max_bytes),
688+
DEFAULT_CONFIG.layer3_remote_fetch_max_bytes,
689+
) ?? DEFAULT_CONFIG.layer3_remote_fetch_max_bytes,
630690
};
631691
}
632692

src/layer3-dynamic/resource-fetcher.ts

Lines changed: 119 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,29 @@ export interface ResourceRequest {
1111
export interface ResourceFetcherOptions {
1212
maxRetries?: number;
1313
timeoutMs?: number;
14+
/**
15+
* Maximum number of bytes accepted in the response body. Enforced against
16+
* both the declared `Content-Length` header (if present) and the running
17+
* byte count during streaming read. Defaults to 1 MiB.
18+
*/
19+
maxBytes?: number;
20+
}
21+
22+
export const DEFAULT_FETCH_TIMEOUT_MS = 5000;
23+
export const DEFAULT_FETCH_MAX_BYTES = 1_048_576;
24+
25+
/**
26+
* Extract Layer 3 remote-fetch limits from the resolved CodeGate config.
27+
* Kept here so callers don't have to remember the config field names.
28+
*/
29+
export function resourceFetcherOptionsFromConfig(config: {
30+
layer3_remote_fetch_timeout_ms: number;
31+
layer3_remote_fetch_max_bytes: number;
32+
}): ResourceFetcherOptions {
33+
return {
34+
timeoutMs: config.layer3_remote_fetch_timeout_ms,
35+
maxBytes: config.layer3_remote_fetch_max_bytes,
36+
};
1437
}
1538

1639
export interface ResourceFetcherDeps {
@@ -58,12 +81,80 @@ function endpointFor(request: ResourceRequest): string {
5881
return request.locator;
5982
}
6083

61-
async function parseResponse(response: Response): Promise<unknown> {
84+
/**
85+
* Read a response body while enforcing `maxBytes`. Returns the collected
86+
* string, or throws a tagged error if the declared `Content-Length` or the
87+
* streamed size exceeds the cap.
88+
*/
89+
async function readBodyWithLimit(response: Response, maxBytes: number): Promise<string> {
90+
const declared = response.headers.get("content-length");
91+
if (declared !== null) {
92+
const parsed = Number(declared);
93+
if (Number.isFinite(parsed) && parsed > maxBytes) {
94+
// Drain & release the stream without reading bytes.
95+
try {
96+
await response.body?.cancel();
97+
} catch {
98+
// no-op: cancel failures are non-fatal.
99+
}
100+
throw new Error(
101+
`response_too_large: declared Content-Length ${parsed} exceeds limit ${maxBytes}`,
102+
);
103+
}
104+
}
105+
106+
const body = response.body;
107+
if (!body) {
108+
// No stream (e.g., HEAD or empty body): fall back to text().
109+
const text = await response.text();
110+
if (Buffer.byteLength(text, "utf8") > maxBytes) {
111+
throw new Error(`response_too_large: body ${Buffer.byteLength(text, "utf8")} > ${maxBytes}`);
112+
}
113+
return text;
114+
}
115+
116+
const reader = body.getReader();
117+
const chunks: Uint8Array[] = [];
118+
let total = 0;
119+
try {
120+
while (true) {
121+
const { done, value } = await reader.read();
122+
if (done) {
123+
break;
124+
}
125+
if (!value) {
126+
continue;
127+
}
128+
total += value.byteLength;
129+
if (total > maxBytes) {
130+
try {
131+
await reader.cancel();
132+
} catch {
133+
// no-op
134+
}
135+
throw new Error(`response_too_large: streamed ${total} > ${maxBytes}`);
136+
}
137+
chunks.push(value);
138+
}
139+
} finally {
140+
try {
141+
reader.releaseLock();
142+
} catch {
143+
// releaseLock throws if the reader was already cancelled; ignore.
144+
}
145+
}
146+
147+
const buffer = Buffer.concat(chunks.map((chunk) => Buffer.from(chunk)));
148+
return buffer.toString("utf8");
149+
}
150+
151+
async function parseResponse(response: Response, maxBytes: number): Promise<unknown> {
62152
const contentType = response.headers.get("content-type") ?? "";
153+
const text = await readBodyWithLimit(response, maxBytes);
63154
if (contentType.includes("application/json")) {
64-
return (await response.json()) as unknown;
155+
return JSON.parse(text) as unknown;
65156
}
66-
return await response.text();
157+
return text;
67158
}
68159

69160
function timeoutError(error: unknown): boolean {
@@ -72,6 +163,11 @@ function timeoutError(error: unknown): boolean {
72163
return message.includes("timeout") || message.includes("aborted");
73164
}
74165

166+
function isResponseTooLarge(error: unknown): boolean {
167+
const message = error instanceof Error ? error.message : String(error);
168+
return message.startsWith("response_too_large");
169+
}
170+
75171
export async function fetchResourceMetadata(
76172
request: ResourceRequest,
77173
customDeps: ResourceFetcherDeps = defaultDeps(),
@@ -104,14 +200,19 @@ export async function fetchResourceMetadata(
104200
}
105201

106202
const endpoint = endpointFor(request);
107-
const timeoutMs = options.timeoutMs ?? 5000;
203+
const timeoutMs = options.timeoutMs ?? DEFAULT_FETCH_TIMEOUT_MS;
204+
const maxBytes = options.maxBytes ?? DEFAULT_FETCH_MAX_BYTES;
108205

109206
for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
110207
try {
111208
const controller = new AbortController();
112209
const timer = setTimeout(() => controller.abort(), timeoutMs);
113-
const response = await deps.fetch(endpoint, { signal: controller.signal });
114-
clearTimeout(timer);
210+
let response: Response;
211+
try {
212+
response = await deps.fetch(endpoint, { signal: controller.signal });
213+
} finally {
214+
clearTimeout(timer);
215+
}
115216

116217
if (response.status === 401 || response.status === 403) {
117218
return {
@@ -135,13 +236,24 @@ export async function fetchResourceMetadata(
135236
};
136237
}
137238

239+
const metadata = await parseResponse(response, maxBytes);
138240
return {
139241
status: "ok",
140242
attempts: attempt + 1,
141243
elapsedMs: deps.now() - startedAt,
142-
metadata: await parseResponse(response),
244+
metadata,
143245
};
144246
} catch (error) {
247+
// Size-limit breaches are deterministic — do not retry, surface as network_error.
248+
if (isResponseTooLarge(error)) {
249+
return {
250+
status: "network_error",
251+
attempts: attempt + 1,
252+
elapsedMs: deps.now() - startedAt,
253+
error: error instanceof Error ? error.message : String(error),
254+
};
255+
}
256+
145257
if (attempt < maxRetries) {
146258
await deps.sleep(100 * (attempt + 1));
147259
continue;

src/layer3-dynamic/tool-description-acquisition.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import {
22
fetchResourceMetadata,
33
type ResourceFetchResult,
4+
type ResourceFetcherOptions,
45
type ResourceKind,
56
type ResourceRequest,
67
} from "./resource-fetcher.js";
@@ -37,9 +38,16 @@ export interface ToolDescriptionAcquisitionDeps {
3738
fetchMetadata: (request: ResourceRequest) => Promise<ResourceFetchResult>;
3839
}
3940

40-
function defaultDeps(): ToolDescriptionAcquisitionDeps {
41+
export interface ToolDescriptionAcquisitionOptions {
42+
fetchOptions?: ResourceFetcherOptions;
43+
}
44+
45+
function defaultDeps(
46+
options: ToolDescriptionAcquisitionOptions = {},
47+
): ToolDescriptionAcquisitionDeps {
4148
return {
42-
fetchMetadata: async (request) => fetchResourceMetadata(request),
49+
fetchMetadata: async (request) =>
50+
fetchResourceMetadata(request, undefined, options.fetchOptions),
4351
};
4452
}
4553

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/**
2+
* URL validation and normalisation helpers for Layer 3 remote resource handling.
3+
*
4+
* These helpers guarantee that remote resources (HTTP/SSE MCP endpoints,
5+
* skill-referenced URLs) use a safe, canonical form before they are fed into
6+
* finding `rule_id` / `file_path` fields or into the fetcher.
7+
*
8+
* Historically, L3 resource IDs were composed as `${kind}:${url}` which, for
9+
* http/sse kinds, produced malformed values like `http:https://mcp.linear.app/mcp`
10+
* (the kind collides with the URL's own scheme). `buildResourceId` avoids that
11+
* double-scheme shape by reusing the URL itself as the id for http/sse kinds.
12+
*/
13+
export type RemoteScheme = "http" | "https";
14+
15+
export interface NormalizeRemoteUrlResult {
16+
ok: true;
17+
url: string;
18+
scheme: RemoteScheme;
19+
}
20+
21+
export interface NormalizeRemoteUrlError {
22+
ok: false;
23+
reason: "empty" | "unsupported_scheme" | "missing_host" | "missing_scheme" | "invalid_url";
24+
}
25+
26+
/**
27+
* Validate and canonicalise a remote URL. Rejects non http/https schemes,
28+
* missing hosts, and malformed inputs. Normalises a bare-host path to a
29+
* single trailing slash and strips trailing slashes from longer paths.
30+
*/
31+
export function normalizeRemoteUrl(
32+
input: string,
33+
): NormalizeRemoteUrlResult | NormalizeRemoteUrlError {
34+
if (typeof input !== "string" || input.trim().length === 0) {
35+
return { ok: false, reason: "empty" };
36+
}
37+
38+
const trimmed = input.trim();
39+
40+
// Quick reject for bare `http:` / `https:` without `//` and host.
41+
if (/^https?:\/?$/iu.test(trimmed)) {
42+
return { ok: false, reason: "missing_host" };
43+
}
44+
45+
// Must start with http:// or https:// (case-insensitive).
46+
if (!/^https?:\/\//iu.test(trimmed)) {
47+
return { ok: false, reason: "missing_scheme" };
48+
}
49+
50+
let parsed: URL;
51+
try {
52+
parsed = new URL(trimmed);
53+
} catch {
54+
return { ok: false, reason: "invalid_url" };
55+
}
56+
57+
const scheme = parsed.protocol.replace(":", "").toLowerCase();
58+
if (scheme !== "http" && scheme !== "https") {
59+
return { ok: false, reason: "unsupported_scheme" };
60+
}
61+
62+
if (parsed.hostname.length === 0) {
63+
return { ok: false, reason: "missing_host" };
64+
}
65+
66+
// Normalise trailing slashes: keep `/` for root paths, strip for others.
67+
if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
68+
parsed.pathname = parsed.pathname.replace(/\/+$/u, "");
69+
}
70+
71+
return {
72+
ok: true,
73+
url: parsed.toString(),
74+
scheme: scheme as RemoteScheme,
75+
};
76+
}
77+
78+
export type DeepScanResourceKind = "npm" | "pypi" | "git" | "http" | "sse";
79+
80+
/**
81+
* Build a canonical resource id used for findings (`rule_id`, `file_path`) and
82+
* for consent prompts. For http/sse kinds the id is the URL itself (no
83+
* `http:` / `sse:` prefix) to avoid the malformed `http:https://...` shape.
84+
* For npm/pypi/git, the `<kind>:<locator>` prefix is preserved because those
85+
* locators are not URLs and other code (e.g. `isRegistryMetadataResource`)
86+
* keys on that prefix.
87+
*/
88+
export function buildResourceId(kind: DeepScanResourceKind, locator: string): string {
89+
if (kind === "http" || kind === "sse") {
90+
return locator;
91+
}
92+
return `${kind}:${locator}`;
93+
}

0 commit comments

Comments
 (0)