Skip to content

Commit f066e5f

Browse files
authored
security: always-on attribute + URL hardening; cap inline nesting (#253)
Dangerous content was only neutralized when safe mode was explicitly enabled, but safe mode is off by default - so a plain HtmlRenderer emitted javascript: hrefs, on* handlers, srcdoc/formaction and CSS expression() unfiltered. Apply a baseline that runs regardless of safe mode; safe mode still layers stricter filtering (raw-HTML policy, style, allowlists) on top. - HtmlRenderer: strip on*/srcdoc/formaction attribute names and blank values carrying a dangerous URL scheme (javascript/vbscript/data/file) or CSS expression()/url()/@import/behavior, always-on. href/src get the dangerous scheme denylist before the safe-mode allowlist. Scheme detection normalizes C0 controls + spaces to defeat java\tscript: evasion. - InlineParser: cap inline recursion (deeply nested links/emphasis rescanned balanced brackets at each level, ~O(n^2) - a few thousand levels took ~10s); beyond the cap the remaining text is emitted literally. - SafeModeTest: the two cases that asserted dangerous URLs pass through with safe mode off now assert the always-on baseline, demonstrating the off-by- default distinction via raw-HTML passthrough instead.
1 parent 78f8aec commit f066e5f

4 files changed

Lines changed: 274 additions & 12 deletions

File tree

src/Parser/InlineParser.php

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,19 @@ class InlineParser
4747
*/
4848
private const INLINE_SPECIAL_CHARS = "\\\n\$`:![<_*^~{\"'-.";
4949

50+
/**
51+
* Maximum inline-recursion depth before remaining text is emitted literally
52+
* (DoS guard, see parseInlines()). Far deeper than any real document.
53+
*
54+
* @var int
55+
*/
56+
protected const MAX_INLINE_DEPTH = 100;
57+
58+
/**
59+
* Current inline-recursion depth (see self::MAX_INLINE_DEPTH).
60+
*/
61+
protected int $inlineDepth = 0;
62+
5063
/**
5164
* @var array<array{type: string, char: string, pos: int, node: \Djot\Node\Node}>
5265
*/
@@ -215,6 +228,29 @@ public function parse(Node $parent, string $text, int $sourceLine = 0): void
215228
}
216229

217230
protected function parseInlines(Node $parent, string $text): void
231+
{
232+
// Inline-nesting DoS guard: deeply nested inline constructs (e.g. a bomb
233+
// of nested links `[[[...](#)](#)...`) recurse through parseInlines and
234+
// rescan balanced brackets at each level, which is ~O(n^2). Beyond this
235+
// depth the remaining text is emitted literally instead of recursing
236+
// further. Far deeper than any real document.
237+
if ($this->inlineDepth >= self::MAX_INLINE_DEPTH) {
238+
if ($text !== '') {
239+
$parent->appendChild(new Text($text));
240+
}
241+
242+
return;
243+
}
244+
245+
$this->inlineDepth++;
246+
try {
247+
$this->parseInlinesImpl($parent, $text);
248+
} finally {
249+
$this->inlineDepth--;
250+
}
251+
}
252+
253+
protected function parseInlinesImpl(Node $parent, string $text): void
218254
{
219255
$length = strlen($text);
220256
$pos = 0;

src/Renderer/HtmlRenderer.php

Lines changed: 127 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -927,9 +927,13 @@ protected function renderLink(Link $node): string
927927
$href = $node->getDestination();
928928
$title = $node->getTitle();
929929

930-
// Sanitize URL in safe mode
931-
if ($this->safeMode !== null && $href !== null) {
932-
$href = $this->safeMode->sanitizeUrl($href);
930+
// Always-on URL hardening (dangerous scheme denylist), then safe mode
931+
// may apply a stricter allowlist on top.
932+
if ($href !== null) {
933+
$href = $this->sanitizeUrlBaseline($href);
934+
if ($this->safeMode !== null) {
935+
$href = $this->safeMode->sanitizeUrl($href);
936+
}
933937
}
934938

935939
$html = '<a';
@@ -963,7 +967,9 @@ protected function renderImage(Image $node): string
963967
$src = $node->getSource();
964968
$title = $node->getTitle();
965969

966-
// Sanitize URL in safe mode
970+
// Always-on URL hardening (dangerous scheme denylist), then safe mode
971+
// may apply a stricter allowlist on top.
972+
$src = $this->sanitizeUrlBaseline($src);
967973
if ($this->safeMode !== null) {
968974
$src = $this->safeMode->sanitizeUrl($src);
969975
}
@@ -1083,14 +1089,130 @@ protected function getRenderableAttributes(Node $node, array $exclude = []): arr
10831089
$attrs = array_diff_key($attrs, array_flip($exclude));
10841090
}
10851091

1086-
// Filter dangerous attributes in safe mode
1092+
// Always-on hardening: strip event handlers / injection sinks and
1093+
// neutralize dangerous attribute values regardless of safe mode.
1094+
$attrs = $this->sanitizeAttributes($attrs);
1095+
1096+
// Safe mode may strip additional names (e.g. style under strict) on top.
10871097
if ($this->safeMode !== null) {
10881098
$attrs = $this->safeMode->filterAttributes($attrs);
10891099
}
10901100

10911101
return $attrs;
10921102
}
10931103

1104+
/**
1105+
* URL schemes that are always neutralized in attribute values and in
1106+
* `href` / `src`, regardless of safe mode.
1107+
*
1108+
* @var array<string>
1109+
*/
1110+
private const DANGEROUS_VALUE_SCHEMES = ['javascript', 'vbscript', 'data', 'file'];
1111+
1112+
/**
1113+
* Always-on attribute hardening, applied regardless of safe mode.
1114+
*
1115+
* Drops event-handler names (`on*`) and the injection sinks `srcdoc` /
1116+
* `formaction`, and blanks a value carrying a dangerous URL scheme or a CSS
1117+
* `expression(...)`. Safe mode (when set) filters further on top. Attribute
1118+
* names are otherwise left as-is (their well-formedness is the parser's job).
1119+
*
1120+
* @param array<string, string> $attrs
1121+
*
1122+
* @return array<string, string>
1123+
*/
1124+
protected function sanitizeAttributes(array $attrs): array
1125+
{
1126+
$out = [];
1127+
foreach ($attrs as $key => $value) {
1128+
$name = strtolower((string)$key);
1129+
if (str_starts_with($name, 'on') || $name === 'srcdoc' || $name === 'formaction') {
1130+
continue;
1131+
}
1132+
$out[$key] = $this->sanitizeAttributeValue($name, (string)$value);
1133+
}
1134+
1135+
return $out;
1136+
}
1137+
1138+
/**
1139+
* Blank an attribute value that carries a dangerous URL scheme or a CSS
1140+
* `expression(...)`. The scheme is normalized (C0 controls + spaces removed)
1141+
* before comparison to defeat `java\tscript:` style evasion.
1142+
*/
1143+
protected function sanitizeAttributeValue(string $name, string $value): string
1144+
{
1145+
$colon = strpos($value, ':');
1146+
if ($colon !== false) {
1147+
$scheme = strtolower((string)preg_replace('/[\x00-\x20]+/', '', substr($value, 0, $colon)));
1148+
if (in_array($scheme, self::DANGEROUS_VALUE_SCHEMES, true)) {
1149+
return '';
1150+
}
1151+
}
1152+
if ($name === 'style' && $this->hasDangerousCss($value)) {
1153+
return '';
1154+
}
1155+
1156+
return $value;
1157+
}
1158+
1159+
/**
1160+
* Detect script-bearing / fetching constructs in a CSS `style` value:
1161+
* `expression()` (legacy IE script), `url(...)` (can fetch or carry
1162+
* `javascript:`), `@import`, and the legacy `behavior` / `-moz-binding`
1163+
* script bindings. CSS escapes are decoded and whitespace collapsed first so
1164+
* `expr\65 ssion(` / `expr ession (` cannot evade.
1165+
*/
1166+
protected function hasDangerousCss(string $value): bool
1167+
{
1168+
$withoutComments = preg_replace('/\/\*.*?\*\//s', '', $value) ?? $value;
1169+
$decoded = preg_replace_callback(
1170+
'/\\\\([0-9A-Fa-f]{1,6}\s?|.)/s',
1171+
static function (array $m): string {
1172+
$escape = $m[1];
1173+
if (preg_match('/^([0-9A-Fa-f]{1,6})\s?$/', $escape, $hex) === 1) {
1174+
$codepoint = (int)hexdec($hex[1]);
1175+
if ($codepoint <= 0 || $codepoint > 0x10FFFF) {
1176+
return '';
1177+
}
1178+
1179+
return mb_chr($codepoint, 'UTF-8');
1180+
}
1181+
1182+
return $escape;
1183+
},
1184+
$withoutComments,
1185+
) ?? $withoutComments;
1186+
$compact = strtolower((string)preg_replace('/\s+/', '', $decoded));
1187+
1188+
return str_contains($compact, 'expression(')
1189+
|| str_contains($compact, 'url(')
1190+
|| str_contains($compact, '@import')
1191+
|| str_contains($compact, 'behavior:')
1192+
|| str_contains($compact, '-moz-binding');
1193+
}
1194+
1195+
/**
1196+
* Always-on URL hardening for `href` / `src`, independent of safe mode.
1197+
*
1198+
* Blanks a URL whose (normalized) scheme is one of the dangerous denylist
1199+
* schemes (`javascript`, `vbscript`, `data`, `file`); every other scheme and
1200+
* any scheme-less URL passes. Safe mode may apply a stricter allowlist on
1201+
* top. Scheme detection strips C0 controls + spaces to defeat `java\tscript:`
1202+
* evasion.
1203+
*/
1204+
protected function sanitizeUrlBaseline(string $url): string
1205+
{
1206+
$probe = (string)preg_replace('/[\x00-\x20]+/', '', $url);
1207+
if (preg_match('/^([a-zA-Z][a-zA-Z0-9+.\-]*):/', $probe, $m) === 1) {
1208+
if (in_array(strtolower($m[1]), self::DANGEROUS_VALUE_SCHEMES, true)) {
1209+
return '';
1210+
}
1211+
}
1212+
1213+
return $url;
1214+
}
1215+
10941216
/**
10951217
* @param array<string, string> $attrs
10961218
*/
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Djot\Test\TestCase\Renderer;
6+
7+
use Djot\DjotConverter;
8+
use PHPUnit\Framework\TestCase;
9+
10+
/**
11+
* Security hardening that applies to the DEFAULT renderer (no safe mode set).
12+
*
13+
* Dangerous URL schemes, event-handler / injection-sink attributes, CSS
14+
* `expression()` and unbounded inline nesting must be neutralized even when the
15+
* caller never opts into safe mode. Safe mode (tested in {@see \Djot\Test\SafeModeTest})
16+
* layers stricter filtering on top.
17+
*/
18+
class AlwaysOnHardeningTest extends TestCase
19+
{
20+
protected DjotConverter $converter;
21+
22+
protected function setUp(): void
23+
{
24+
// No safe mode configured: this is the default converter.
25+
$this->converter = new DjotConverter();
26+
}
27+
28+
public function testDangerousLinkSchemeBlankedWithoutSafeMode(): void
29+
{
30+
$result = $this->converter->convert('[x](javascript:alert(1))');
31+
$this->assertStringNotContainsString('javascript:', $result);
32+
$this->assertStringContainsString('<a href="">x</a>', $result);
33+
}
34+
35+
public function testDangerousImageSchemeBlankedWithoutSafeMode(): void
36+
{
37+
$result = $this->converter->convert('![a](vbscript:msgbox(1))');
38+
$this->assertStringNotContainsString('vbscript:', $result);
39+
}
40+
41+
public function testSchemeEvasionWithControlCharIsBlanked(): void
42+
{
43+
// `java\tscript:` must not slip past the denylist.
44+
$result = $this->converter->convert("[x](java\tscript:alert(1))");
45+
$this->assertStringNotContainsString('script:', $result);
46+
}
47+
48+
public function testSafeSchemeAndRelativeUrlPreserved(): void
49+
{
50+
$this->assertStringContainsString(
51+
'href="https://ok.example"',
52+
$this->converter->convert('[x](https://ok.example)'),
53+
);
54+
$this->assertStringContainsString(
55+
'href="/local/page"',
56+
$this->converter->convert('[x](/local/page)'),
57+
);
58+
}
59+
60+
public function testEventHandlerAttributeStrippedWithoutSafeMode(): void
61+
{
62+
$result = $this->converter->convert('[x](https://a){onclick="alert(1)"}');
63+
$this->assertStringNotContainsString('onclick', $result);
64+
}
65+
66+
public function testInjectionSinkAttributesStrippedWithoutSafeMode(): void
67+
{
68+
$result = $this->converter->convert('text{srcdoc="x" formaction="y"}');
69+
$this->assertStringNotContainsString('srcdoc', $result);
70+
$this->assertStringNotContainsString('formaction', $result);
71+
}
72+
73+
public function testCssExpressionBlankedWithoutSafeMode(): void
74+
{
75+
$result = $this->converter->convert('text{style="x:expression(alert(1))"}');
76+
$this->assertStringNotContainsString('expression(', $result);
77+
}
78+
79+
public function testDangerousAttributeValueSchemeBlanked(): void
80+
{
81+
// A non-href attribute may still carry a javascript: payload.
82+
$result = $this->converter->convert('text{data-x="javascript:alert(1)"}');
83+
$this->assertStringNotContainsString('javascript:', $result);
84+
}
85+
86+
public function testDeeplyNestedInlineDoesNotBlowUp(): void
87+
{
88+
$depth = 4000;
89+
$bomb = str_repeat('[', $depth) . 'x' . str_repeat('](u)', $depth);
90+
91+
$start = microtime(true);
92+
$result = $this->converter->convert($bomb);
93+
$elapsed = microtime(true) - $start;
94+
95+
// Without the cap this is ~O(n^2) (seconds); the guard keeps it fast.
96+
$this->assertLessThan(2.0, $elapsed, 'inline nesting DoS guard did not bound parse time');
97+
$this->assertNotSame('', $result);
98+
}
99+
}

tests/TestCase/SafeModeTest.php

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -243,11 +243,14 @@ public function testRawHtmlAllowedWhenConfigured(): void
243243
public function testSafeModeDisabledByDefault(): void
244244
{
245245
$converter = new DjotConverter();
246-
$djot = '[click](javascript:alert(1))';
247-
$result = $converter->convert($djot);
248246

249-
// Without safe mode, dangerous URLs are allowed
250-
$this->assertStringContainsString('javascript:alert(1)', $result);
247+
// Safe-mode-only filtering is not applied by default: raw HTML passes
248+
// through instead of being escaped.
249+
$this->assertStringContainsString('<b>bold</b>', $converter->convert('`<b>bold</b>`{=html}'));
250+
251+
// Always-on hardening still neutralizes dangerous URL schemes even with
252+
// safe mode off (there is no legitimate reason to emit javascript:).
253+
$this->assertStringNotContainsString('javascript:', $converter->convert('[click](javascript:alert(1))'));
251254
}
252255

253256
public function testSafeModeCanBeEnabledAfterConstruction(): void
@@ -266,10 +269,12 @@ public function testSafeModeCanBeDisabledAfterConstruction(): void
266269
$converter = new DjotConverter(safeMode: true);
267270
$converter->setSafeMode(false);
268271

269-
$djot = '[click](javascript:alert(1))';
270-
$result = $converter->convert($djot);
272+
// Disabling safe mode restores the safe-mode-gated behavior (raw HTML
273+
// passes through again)...
274+
$this->assertStringContainsString('<b>bold</b>', $converter->convert('`<b>bold</b>`{=html}'));
271275

272-
$this->assertStringContainsString('javascript:alert(1)', $result);
276+
// ...but the always-on URL hardening still applies regardless.
277+
$this->assertStringNotContainsString('javascript:', $converter->convert('[click](javascript:alert(1))'));
273278
}
274279

275280
public function testCustomSafeModeConfiguration(): void

0 commit comments

Comments
 (0)