Skip to content

Commit 8eb7e09

Browse files
authored
fix(sanitize): switch sanitizeUrlProtocols to allowlist for ://-scheme URLs (#31715)
1 parent 4f4cc8d commit 8eb7e09

2 files changed

Lines changed: 112 additions & 28 deletions

File tree

actions/setup/js/sanitize_content.test.cjs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,73 @@ describe("sanitize_content.cjs", () => {
956956
expect(result).toContain("(redacted)");
957957
expect(result).not.toContain("javascript%25253A");
958958
});
959+
960+
it("should redact ws:// URLs (WebSocket)", () => {
961+
const result = sanitizeContent("Connect to ws://evil.com/socket");
962+
expect(result).toContain("(evil.com/redacted)");
963+
expect(result).not.toContain("ws://");
964+
});
965+
966+
it("should redact wss:// URLs (secure WebSocket)", () => {
967+
const result = sanitizeContent("Connect to wss://evil.com/socket");
968+
expect(result).toContain("(evil.com/redacted)");
969+
expect(result).not.toContain("wss://");
970+
});
971+
972+
it("should redact smb:// URLs (SMB shares)", () => {
973+
const result = sanitizeContent("[share](smb://attacker.com/share)");
974+
expect(result).toContain("(attacker.com/redacted)");
975+
expect(result).not.toContain("smb://");
976+
});
977+
978+
it("should redact irc:// URLs", () => {
979+
const result = sanitizeContent("Join irc://irc.libera.chat/#channel");
980+
expect(result).toContain("(irc.libera.chat/redacted)");
981+
expect(result).not.toContain("irc://");
982+
});
983+
984+
it("should redact ldap:// URLs", () => {
985+
const result = sanitizeContent("[x](ldap://evil.com/cn=admin)");
986+
expect(result).toContain("(evil.com/redacted)");
987+
expect(result).not.toContain("ldap://");
988+
});
989+
990+
it("should redact ldaps:// URLs", () => {
991+
const result = sanitizeContent("[x](ldaps://evil.com/cn=admin)");
992+
expect(result).toContain("(evil.com/redacted)");
993+
expect(result).not.toContain("ldaps://");
994+
});
995+
996+
it("should redact rtsp:// URLs", () => {
997+
const result = sanitizeContent("Stream rtsp://attacker.com/stream");
998+
expect(result).toContain("(attacker.com/redacted)");
999+
expect(result).not.toContain("rtsp://");
1000+
});
1001+
1002+
it("should redact magnet: URLs", () => {
1003+
const result = sanitizeContent("[torrent](magnet:?xt=urn:btih:abc123)");
1004+
expect(result).toContain("(redacted)");
1005+
expect(result).not.toContain("magnet:");
1006+
});
1007+
1008+
it("should not redact https:// URLs in protocol sanitization step", () => {
1009+
const result = sanitizeContent("Visit https://github.com/repo");
1010+
expect(result).toBe("Visit https://github.com/repo");
1011+
});
1012+
1013+
it("should not corrupt https:// URLs by matching a suffix of the scheme", () => {
1014+
// Regression: the new allowlist regex must not match "ttps://" as a protocol
1015+
// within "https://github.com" and redact github.com.
1016+
const result = sanitizeContent("See https://github.com/org/repo for details");
1017+
expect(result).toBe("See https://github.com/org/repo for details");
1018+
});
1019+
1020+
it("should redact protocol:// URLs with no host (empty domain)", () => {
1021+
// e.g. file:///etc/passwd — domain capture group is empty
1022+
const result = sanitizeContent("file:///etc/passwd");
1023+
expect(result).toContain("(redacted)");
1024+
expect(result).not.toContain("file://");
1025+
});
9591026
});
9601027

9611028
describe("URL domain filtering", () => {

actions/setup/js/sanitize_content_core.cjs

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -176,12 +176,12 @@ function sanitizeDomainName(domain) {
176176
* @returns {string} The string with non-https protocols redacted
177177
*/
178178
function sanitizeUrlProtocols(s) {
179-
// Normalize percent-encoded colons before applying the protocol blocklist.
179+
// Normalize percent-encoded colons before applying the protocol filter.
180180
// This prevents bypasses via javascript%3Aalert(1) (single-encoded),
181181
// javascript%253Aalert(1) (double-encoded), or deeper nesting.
182182
// Strategy: iteratively decode %25 -> % (up to 4 passes, which handles
183183
// encodings up to 5 levels deep) until stable, then decode %3A -> :
184-
// so the blocklist regex always sees literal colons.
184+
// so the filter regex always sees literal colons.
185185
let normalized = s;
186186
// Iteratively decode %25XX (percent-encoded percent signs) one level at a
187187
// time. 4 passes handles up to 5 encoding levels, which is far beyond the
@@ -197,35 +197,52 @@ function sanitizeUrlProtocols(s) {
197197
}
198198
normalized = normalized.replace(/%3[Aa]/gi, ":"); // decode %3A -> :
199199

200-
// Match common non-https protocols
201-
// This regex matches: protocol://domain or protocol:path or incomplete protocol://
202-
// Examples: http://, ftp://, file://, data:, javascript:, mailto:, tel:, ssh://, git://
203-
// The regex also matches incomplete protocols like "http://" or "ftp://" without a domain
204-
// Note: No word boundary check to catch protocols even when preceded by word characters
205-
return normalized.replace(/((?:http|ftp|file|ssh|git):\/\/([\w.-]*)(?:[^\s]*)|(?:data|javascript|vbscript|about|mailto|tel):[^\s]+)/gi, (match, _fullMatch, domain) => {
206-
// Extract domain for http/ftp/file/ssh/git protocols
207-
if (domain) {
208-
const domainLower = domain.toLowerCase();
209-
const sanitized = sanitizeDomainName(domainLower);
210-
const truncated = domainLower.length > 12 ? domainLower.substring(0, 12) + "..." : domainLower;
200+
// ── Step 1: allowlist-based protocol:// filtering ──────────────────────────
201+
// Redact every scheme:// URL that is NOT https://. This covers http://,
202+
// ftp://, ssh://, git://, ws://, wss://, smb://, irc://, ldap://, ldaps://,
203+
// rtsp://, feed://, and any future schemes — eliminating the class of
204+
// blocklist-incompleteness bypasses.
205+
//
206+
// Regex anchors that protect existing https:// URLs:
207+
// (?<![a-z0-9]) — negative lookbehind: ensures we do not match a suffix of
208+
// another protocol name (e.g. "ttps://" inside "https://…").
209+
// (?!https://) — negative lookahead: explicitly excludes https://, which is
210+
// passed through to sanitizeUrlDomains for domain filtering.
211+
let result = normalized.replace(/(?<![a-z0-9])(?!https:\/\/)([a-z][a-z0-9+.-]*)(:\/\/)([\w.-]*)([^\s]*)/gi, (_match, scheme, _slashes, domain, _rest) => {
212+
const fullMatch = _match;
213+
if (!domain) {
214+
// No host present (e.g. "file:///path" or bare "http://"). Use the scheme
215+
// (e.g. "file://") as the redacted-domain token so the redaction summary
216+
// remains useful without recording an empty-string entry.
217+
const truncated = fullMatch.length > 12 ? fullMatch.substring(0, 12) + "..." : fullMatch;
211218
core.info(`Redacted URL: ${truncated}`);
212-
core.debug(`Redacted URL (full): ${match}`);
213-
addRedactedDomain(domainLower);
214-
// Return sanitized domain format
215-
return sanitized ? `(${sanitized}/redacted)` : "(redacted)";
216-
} else {
217-
// For other protocols (data:, javascript:, etc.), track the protocol itself
218-
const protocolMatch = match.match(/^([^:]+):/);
219-
if (protocolMatch) {
220-
const protocol = protocolMatch[1] + ":";
221-
// Truncate the matched URL for logging (keep first 12 chars + "...")
222-
const truncated = match.length > 12 ? match.substring(0, 12) + "..." : match;
223-
core.info(`Redacted URL: ${truncated}`);
224-
core.debug(`Redacted URL (full): ${match}`);
225-
addRedactedDomain(protocol);
226-
}
219+
core.debug(`Redacted URL (full): ${fullMatch}`);
220+
addRedactedDomain(scheme.toLowerCase() + "://");
227221
return "(redacted)";
228222
}
223+
const domainLower = domain.toLowerCase();
224+
const sanitized = sanitizeDomainName(domainLower);
225+
const truncated = domainLower.length > 12 ? domainLower.substring(0, 12) + "..." : domainLower;
226+
core.info(`Redacted URL: ${truncated}`);
227+
core.debug(`Redacted URL (full): ${fullMatch}`);
228+
addRedactedDomain(domainLower);
229+
return sanitized ? `(${sanitized}/redacted)` : "(redacted)";
230+
});
231+
232+
// ── Step 2: blocklist-based single-colon scheme filtering ───────────────────
233+
// For schemes that do not use "//", a targeted blocklist is used because a
234+
// fully general single-colon pattern produces too many false positives
235+
// (e.g. "key:value" in YAML, "std::vector" in C++, "C:\path" on Windows).
236+
return result.replace(/(?:data|javascript|vbscript|about|mailto|tel|magnet):[^\s]+/gi, match => {
237+
const protocolMatch = match.match(/^([^:]+):/);
238+
if (protocolMatch) {
239+
const protocol = protocolMatch[1] + ":";
240+
const truncated = match.length > 12 ? match.substring(0, 12) + "..." : match;
241+
core.info(`Redacted URL: ${truncated}`);
242+
core.debug(`Redacted URL (full): ${match}`);
243+
addRedactedDomain(protocol);
244+
}
245+
return "(redacted)";
229246
});
230247
}
231248

0 commit comments

Comments
 (0)