Skip to content

Commit 72c4591

Browse files
authored
Normalize NUL bytes to U+FFFD at parse entry (#248)
A raw NUL (U+0000) must never reach rendered output. Replace it with the U+FFFD replacement character at the parse entry (WHATWG-style; the behavior agreed on across implementations), so that a control byte cannot survive into the HTML. SafeMode now also strips U+FFFD from a URL scheme, so a `java\x00script:` evasion — which arrives as `java\u{FFFD}script:` after normalization — is still detected and blocked (empty href). Ported from carve-php commit ff40264.
1 parent 306e362 commit 72c4591

3 files changed

Lines changed: 41 additions & 3 deletions

File tree

src/Parser/BlockParser.php

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -496,6 +496,14 @@ public function parse(string $input): Document
496496
$this->headingReferenceLabels = [];
497497
$this->lineOffset = 0;
498498
$document = new Document();
499+
500+
// Replace any NUL (U+0000) with the U+FFFD replacement character so a
501+
// control byte never reaches output (decided cross-impl behavior;
502+
// WHATWG-style). A raw NUL must never reach rendered output.
503+
if (str_contains($input, "\0")) {
504+
$input = str_replace("\0", "\u{FFFD}", $input);
505+
}
506+
499507
$lines = $this->splitLines($input);
500508

501509
// First pass: extract reference definitions, footnotes, abbreviations, and heading references
@@ -561,7 +569,7 @@ protected function extractReferences(array $lines): void
561569
// `"Title"` makes the line not a reference definition (matches djot.js)
562570
if (preg_match('/^\[([^\]]+)\]:(?:[ \t]+(\S*))?[ \t]*$/', $line, $matches)) {
563571
// Normalize label: collapse whitespace, trim
564-
$label = preg_replace('/\s+/', ' ', trim($matches[1]));
572+
$label = preg_replace('/\s+/', ' ', trim($matches[1])) ?? '';
565573
$url = trim($matches[2] ?? '');
566574

567575
// Collect continuation lines (URL can start on continuation line)

src/SafeMode.php

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -228,8 +228,11 @@ public function isUrlSafe(string $url): bool
228228
$scheme = substr($url, 0, $colonPos);
229229

230230
// Normalize scheme: strip ASCII whitespace and control characters (0x00-0x20)
231-
// This prevents bypass attempts like "java\tscript:", "java\x0bscript:", etc.
232-
$scheme = preg_replace('/[\x00-\x20]+/', '', $scheme) ?? $scheme;
231+
// plus the U+FFFD replacement character (a NUL becomes U+FFFD at the
232+
// parse entry, so a `java\x00script:` evasion arrives as
233+
// `java\u{FFFD}script:`). This prevents bypass attempts like
234+
// "java\tscript:", "java\x0bscript:", "java\x00script:", etc.
235+
$scheme = preg_replace('/[\x00-\x20]+|\x{FFFD}+/u', '', $scheme) ?? $scheme;
233236
$scheme = strtolower($scheme);
234237

235238
// Check against dangerous schemes

tests/TestCase/Parser/BlockParserTest.php

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -703,4 +703,31 @@ public function testRawBlockTrimsLeadingAndTrailingBlankLines(): void
703703
$this->assertCount(1, $children);
704704
$this->assertSame('<b>bold</b>', $children[0]->getContent());
705705
}
706+
707+
public function testNulByteIsReplacedWithReplacementCharacter(): void
708+
{
709+
// A raw NUL (U+0000) must never reach rendered output: it is normalized
710+
// to the U+FFFD replacement character at the parse entry (WHATWG-style,
711+
// decided cross-impl behavior).
712+
$converter = new DjotConverter();
713+
$html = $converter->convert("a\0b");
714+
715+
$this->assertStringNotContainsString("\0", $html, 'A raw NUL byte must not reach output');
716+
$this->assertStringContainsString("\u{FFFD}", $html, 'NUL must be normalized to U+FFFD');
717+
$this->assertStringContainsString("a\u{FFFD}b", $html);
718+
}
719+
720+
public function testNulByteInLinkSchemeDoesNotProduceExecutableHref(): void
721+
{
722+
// A `java\x00script:` evasion arrives as `java\u{FFFD}script:` after NUL
723+
// normalization. SafeMode also strips U+FFFD from a scheme, so the
724+
// evasion is still detected and blocked (empty href), and no raw NUL or
725+
// executable javascript: scheme may reach output.
726+
$converter = new DjotConverter(safeMode: true);
727+
$html = $converter->convert("[x](java\0script:alert(1))");
728+
729+
$this->assertStringNotContainsString("\0", $html, 'A raw NUL byte must not reach output');
730+
$this->assertStringNotContainsString('javascript:', $html, 'No executable javascript: scheme');
731+
$this->assertStringContainsString('href=""', $html, 'NUL-evasion javascript: scheme is blocked');
732+
}
706733
}

0 commit comments

Comments
 (0)