@@ -176,12 +176,12 @@ function sanitizeDomainName(domain) {
176176 * @returns {string } The string with non-https protocols redacted
177177 */
178178function sanitizeUrlProtocols ( s ) {
179- // Normalize percent-encoded colons before applying the protocol blocklist .
179+ // Normalize percent-encoded colons before applying the protocol filter .
180180 // This prevents bypasses via javascript%3Aalert(1) (single-encoded),
181181 // javascript%253Aalert(1) (double-encoded), or deeper nesting.
182182 // Strategy: iteratively decode %25 -> % (up to 4 passes, which handles
183183 // encodings up to 5 levels deep) until stable, then decode %3A -> :
184- // so the blocklist regex always sees literal colons.
184+ // so the filter regex always sees literal colons.
185185 let normalized = s ;
186186 // Iteratively decode %25XX (percent-encoded percent signs) one level at a
187187 // time. 4 passes handles up to 5 encoding levels, which is far beyond the
@@ -197,35 +197,52 @@ function sanitizeUrlProtocols(s) {
197197 }
198198 normalized = normalized . replace ( / % 3 [ A a ] / gi, ":" ) ; // decode %3A -> :
199199
200- // Match common non-https protocols
201- // This regex matches: protocol://domain or protocol:path or incomplete protocol://
202- // Examples: http://, ftp://, file://, data:, javascript:, mailto:, tel:, ssh://, git://
203- // The regex also matches incomplete protocols like "http://" or "ftp://" without a domain
204- // Note: No word boundary check to catch protocols even when preceded by word characters
205- return normalized . replace ( / ( (?: h t t p | f t p | f i l e | s s h | g i t ) : \/ \/ ( [ \w . - ] * ) (?: [ ^ \s ] * ) | (?: d a t a | j a v a s c r i p t | v b s c r i p t | a b o u t | m a i l t o | t e l ) : [ ^ \s ] + ) / gi, ( match , _fullMatch , domain ) => {
206- // Extract domain for http/ftp/file/ssh/git protocols
207- if ( domain ) {
208- const domainLower = domain . toLowerCase ( ) ;
209- const sanitized = sanitizeDomainName ( domainLower ) ;
210- const truncated = domainLower . length > 12 ? domainLower . substring ( 0 , 12 ) + "..." : domainLower ;
200+ // ── Step 1: allowlist-based protocol:// filtering ──────────────────────────
201+ // Redact every scheme:// URL that is NOT https://. This covers http://,
202+ // ftp://, ssh://, git://, ws://, wss://, smb://, irc://, ldap://, ldaps://,
203+ // rtsp://, feed://, and any future schemes — eliminating the class of
204+ // blocklist-incompleteness bypasses.
205+ //
206+ // Regex anchors that protect existing https:// URLs:
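207+ // (?<![a-z0-9]) — negative lookbehind: a match may not begin right after a letter or digit, so we never match a suffix of
208+ // another protocol name (e.g. "ttps://" inside "https://…"). Caveat: a letter prefix is absorbed into the scheme ("xhttp://" is still redacted), but a digit-prefixed scheme such as "1http://host" is NOT matched.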
209+ // (?!https://) — negative lookahead: explicitly excludes https://, which is
210+ // passed through to sanitizeUrlDomains for domain filtering.
211+ let result = normalized . replace ( / (?< ! [ a - z 0 - 9 ] ) (? ! h t t p s : \/ \/ ) ( [ a - z ] [ a - z 0 - 9 + . - ] * ) ( : \/ \/ ) ( [ \w . - ] * ) ( [ ^ \s ] * ) / gi, ( _match , scheme , _slashes , domain , _rest ) => {
212+ const fullMatch = _match ;
213+ if ( ! domain ) {
214+ // No host present (e.g. "file:///path" or bare "http://"). Use the scheme
215+ // (e.g. "file://") as the redacted-domain token so the redaction summary
216+ // remains useful without recording an empty-string entry.
217+ const truncated = fullMatch . length > 12 ? fullMatch . substring ( 0 , 12 ) + "..." : fullMatch ;
211218 core . info ( `Redacted URL: ${ truncated } ` ) ;
212- core . debug ( `Redacted URL (full): ${ match } ` ) ;
213- addRedactedDomain ( domainLower ) ;
214- // Return sanitized domain format
215- return sanitized ? `(${ sanitized } /redacted)` : "(redacted)" ;
216- } else {
217- // For other protocols (data:, javascript:, etc.), track the protocol itself
218- const protocolMatch = match . match ( / ^ ( [ ^ : ] + ) : / ) ;
219- if ( protocolMatch ) {
220- const protocol = protocolMatch [ 1 ] + ":" ;
221- // Truncate the matched URL for logging (keep first 12 chars + "...")
222- const truncated = match . length > 12 ? match . substring ( 0 , 12 ) + "..." : match ;
223- core . info ( `Redacted URL: ${ truncated } ` ) ;
224- core . debug ( `Redacted URL (full): ${ match } ` ) ;
225- addRedactedDomain ( protocol ) ;
226- }
219+ core . debug ( `Redacted URL (full): ${ fullMatch } ` ) ;
220+ addRedactedDomain ( scheme . toLowerCase ( ) + "://" ) ;
227221 return "(redacted)" ;
228222 }
223+ const domainLower = domain . toLowerCase ( ) ;
224+ const sanitized = sanitizeDomainName ( domainLower ) ;
225+ const truncated = domainLower . length > 12 ? domainLower . substring ( 0 , 12 ) + "..." : domainLower ;
226+ core . info ( `Redacted URL: ${ truncated } ` ) ;
227+ core . debug ( `Redacted URL (full): ${ fullMatch } ` ) ;
228+ addRedactedDomain ( domainLower ) ;
229+ return sanitized ? `(${ sanitized } /redacted)` : "(redacted)" ;
230+ } ) ;
231+
232+ // ── Step 2: blocklist-based single-colon scheme filtering ───────────────────
233+ // For schemes that do not use "//", a targeted blocklist is used because a
234+ // fully general single-colon pattern produces too many false positives
235+ // (e.g. "key:value" in YAML, "std::vector" in C++, "C:\path" on Windows).
236+ return result . replace ( / (?: d a t a | j a v a s c r i p t | v b s c r i p t | a b o u t | m a i l t o | t e l | m a g n e t ) : [ ^ \s ] + / gi, match => {
237+ const protocolMatch = match . match ( / ^ ( [ ^ : ] + ) : / ) ;
238+ if ( protocolMatch ) {
239+ const protocol = protocolMatch [ 1 ] + ":" ;
240+ const truncated = match . length > 12 ? match . substring ( 0 , 12 ) + "..." : match ;
241+ core . info ( `Redacted URL: ${ truncated } ` ) ;
242+ core . debug ( `Redacted URL (full): ${ match } ` ) ;
243+ addRedactedDomain ( protocol ) ;
244+ }
245+ return "(redacted)" ;
229246 } ) ;
230247}
231248
0 commit comments