Skip to content

Commit 10ebe53

Browse files
committed
Page Content: Added more complex & configurable content filtering
- Added new option to control parts of the filter. - Added whitelist filtering pass via HTMLPurifier.
1 parent 46dcc30 commit 10ebe53

File tree

10 files changed

+292
-63
lines changed

10 files changed

+292
-63
lines changed

app/Activity/Models/Comment.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use BookStack\Users\Models\HasCreatorAndUpdater;
99
use BookStack\Users\Models\OwnableInterface;
1010
use BookStack\Util\HtmlContentFilter;
11+
use BookStack\Util\HtmlContentFilterConfig;
1112
use Illuminate\Database\Eloquent\Builder;
1213
use Illuminate\Database\Eloquent\Factories\HasFactory;
1314
use Illuminate\Database\Eloquent\Relations\BelongsTo;
@@ -82,7 +83,8 @@ public function logDescriptor(): string
8283

8384
public function safeHtml(): string
8485
{
85-
return HtmlContentFilter::removeActiveContentFromHtmlString($this->html ?? '');
86+
$filter = new HtmlContentFilter(new HtmlContentFilterConfig());
87+
return $filter->filterString($this->html ?? '');
8688
}
8789

8890
public function jointPermissions(): HasMany

app/Config/app.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,18 @@
4242
// Even when overridden the WYSIWYG editor may still escape script content.
4343
'allow_content_scripts' => env('ALLOW_CONTENT_SCRIPTS', false),
4444

45+
// Control the behaviour of page content filtering.
46+
// This setting is a collection of characters which represent different available filters:
47+
// - j - Filter out JavaScript based content
48+
// - h - Filter out unexpected, potentially dangerous, HTML elements
49+
// - f - Filter out unexpected form elements
50+
// - a - Run content through a more complex allow-list filter
51+
// This defaults to using all filters, unless ALLOW_CONTENT_SCRIPTS is set to true in which case no filters are used.
52+
// Note: These filters are a best attempt, and may not be 100% effective. They are typically a layer used in addition to other security measures.
53+
// TODO - Add to example env
54+
// TODO - Remove allow_content_scripts option above
55+
'content_filtering' => env('CONTENT_FILTERING', env('ALLOW_CONTENT_SCRIPTS', false) === true ? '' : 'jfha'),
56+
4557
// Allow server-side fetches to be performed to potentially unknown
4658
// and user-provided locations. Primarily used in exports when loading
4759
// in externally referenced assets.

app/Entities/Tools/EntityHtmlDescription.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
use BookStack\Entities\Models\Bookshelf;
77
use BookStack\Entities\Models\Chapter;
88
use BookStack\Util\HtmlContentFilter;
9+
use BookStack\Util\HtmlContentFilterConfig;
910

1011
class EntityHtmlDescription
1112
{
@@ -50,7 +51,8 @@ public function getHtml(bool $raw = false): string
5051
return $html;
5152
}
5253

53-
return HtmlContentFilter::removeActiveContentFromHtmlString($html);
54+
$filter = new HtmlContentFilter(new HtmlContentFilterConfig());
55+
return $filter->filterString($html);
5456
}
5557

5658
public function getPlain(): string

app/Entities/Tools/PageContent.php

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
use BookStack\Uploads\ImageService;
1414
use BookStack\Users\Models\User;
1515
use BookStack\Util\HtmlContentFilter;
16+
use BookStack\Util\HtmlContentFilterConfig;
1617
use BookStack\Util\HtmlDocument;
1718
use BookStack\Util\WebSafeMimeSniffer;
1819
use Closure;
@@ -317,11 +318,28 @@ public function render(bool $blankIncludes = false): string
317318
$this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
318319
}
319320

320-
if (!config('app.allow_content_scripts')) {
321-
HtmlContentFilter::removeActiveContentFromDocument($doc);
321+
$cacheKey = $this->getContentCacheKey($doc->getBodyInnerHtml());
322+
$cached = cache()->get($cacheKey, null);
323+
if ($cached !== null) {
324+
return $cached;
322325
}
323326

324-
return $doc->getBodyInnerHtml();
327+
$filterConfig = HtmlContentFilterConfig::fromConfigString(config('app.content_filtering'));
328+
$filter = new HtmlContentFilter($filterConfig);
329+
$filtered = $filter->filterDocument($doc);
330+
331+
$cacheTime = 86400 * 7; // 1 week
332+
cache()->put($cacheKey, $filtered, $cacheTime);
333+
334+
return $filtered;
335+
}
336+
337+
protected function getContentCacheKey(string $html): string
338+
{
339+
$contentHash = md5($html);
340+
$contentId = $this->page->id;
341+
$contentTime = $this->page->updated_at->timestamp;
342+
return "page-content-cache::{$contentId}::{$contentTime}::{$contentHash}";
325343
}
326344

327345
/**

app/Theming/CustomHtmlHeadContentProvider.php

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,16 @@
44

55
use BookStack\Util\CspService;
66
use BookStack\Util\HtmlContentFilter;
7+
use BookStack\Util\HtmlContentFilterConfig;
78
use BookStack\Util\HtmlNonceApplicator;
89
use Illuminate\Contracts\Cache\Repository as Cache;
910

1011
class CustomHtmlHeadContentProvider
1112
{
12-
/**
13-
* @var CspService
14-
*/
15-
protected $cspService;
16-
17-
/**
18-
* @var Cache
19-
*/
20-
protected $cache;
21-
22-
public function __construct(CspService $cspService, Cache $cache)
23-
{
24-
$this->cspService = $cspService;
25-
$this->cache = $cache;
13+
public function __construct(
14+
protected CspService $cspService,
15+
protected Cache $cache
16+
) {
2617
}
2718

2819
/**
@@ -50,7 +41,8 @@ public function forExport(): string
5041
$hash = md5($content);
5142

5243
return $this->cache->remember('custom-head-export:' . $hash, 86400, function () use ($content) {
53-
return HtmlContentFilter::removeActiveContentFromHtmlString($content);
44+
$config = new HtmlContentFilterConfig(filterOutNonContentElements: false);
45+
return (new HtmlContentFilter($config))->filterString($content);
5446
});
5547
}
5648

app/Util/HtmlContentFilter.php

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,53 @@
55
use DOMAttr;
66
use DOMElement;
77
use DOMNodeList;
8+
use HTMLPurifier;
9+
use HTMLPurifier_HTML5Config;
810

911
class HtmlContentFilter
1012
{
11-
/**
12-
* Remove all active content from the given HTML document.
13-
* This aims to cover anything which can dynamically deal with, or send, data
14-
* like any JavaScript actions or form content.
15-
*/
16-
public static function removeActiveContentFromDocument(HtmlDocument $doc): void
13+
public function __construct(
14+
protected HtmlContentFilterConfig $config
15+
) {
16+
}
17+
18+
public function filterDocument(HtmlDocument $doc): string
19+
{
20+
if ($this->config->filterOutJavaScript) {
21+
$this->filterOutScriptsFromDocument($doc);
22+
}
23+
if ($this->config->filterOutFormElements) {
24+
$this->filterOutFormElementsFromDocument($doc);
25+
}
26+
if ($this->config->filterOutBadHtmlElements) {
27+
$this->filterOutBadHtmlElementsFromDocument($doc);
28+
}
29+
if ($this->config->filterOutNonContentElements) {
30+
$this->filterOutNonContentElementsFromDocument($doc);
31+
}
32+
33+
$filtered = $doc->getBodyInnerHtml();
34+
if ($this->config->useAllowListFilter) {
35+
$filtered = $this->applyAllowListFiltering($filtered);
36+
}
37+
38+
return $filtered;
39+
}
40+
41+
public function filterString(string $html): string
42+
{
43+
return $this->filterDocument(new HtmlDocument($html));
44+
}
45+
46+
protected function applyAllowListFiltering(string $html): string
47+
{
48+
$config = HTMLPurifier_HTML5Config::createDefault();
49+
$config->set('Cache.SerializerPath', storage_path('purifier'));
50+
$purifier = new HTMLPurifier($config);
51+
return $purifier->purify($html);
52+
}
53+
54+
protected function filterOutScriptsFromDocument(HtmlDocument $doc): void
1755
{
1856
// Remove standard script tags
1957
$scriptElems = $doc->queryXPath('//script');
@@ -27,10 +65,6 @@ public static function removeActiveContentFromDocument(HtmlDocument $doc): void
2765
$badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
2866
static::removeNodes($badForms);
2967

30-
// Remove meta tag to prevent external redirects
31-
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
32-
static::removeNodes($metaTags);
33-
3468
// Remove data or JavaScript iFrames
3569
$badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
3670
static::removeNodes($badIframes);
@@ -49,7 +83,10 @@ public static function removeActiveContentFromDocument(HtmlDocument $doc): void
4983
// Remove 'on*' attributes
5084
$onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
5185
static::removeAttributes($onAttributes);
86+
}
5287

88+
protected function filterOutFormElementsFromDocument(HtmlDocument $doc): void
89+
{
5390
// Remove form elements
5491
$formElements = ['form', 'fieldset', 'button', 'textarea', 'select'];
5592
foreach ($formElements as $formElement) {
@@ -75,41 +112,21 @@ public static function removeActiveContentFromDocument(HtmlDocument $doc): void
75112
}
76113
}
77114

78-
/**
79-
* Remove active content from the given HTML string.
80-
* This aims to cover anything which can dynamically deal with, or send, data
81-
* like any JavaScript actions or form content.
82-
*/
83-
public static function removeActiveContentFromHtmlString(string $html): string
115+
protected function filterOutBadHtmlElementsFromDocument(HtmlDocument $doc): void
84116
{
85-
if (empty($html)) {
86-
return $html;
87-
}
88-
89-
$doc = new HtmlDocument($html);
90-
static::removeActiveContentFromDocument($doc);
91-
92-
return $doc->getBodyInnerHtml();
93-
}
94-
95-
/**
96-
* Alias using the old method name to avoid potential compatibility breaks during patch release.
97-
* To remove in future feature release.
98-
* @deprecated Use removeActiveContentFromDocument instead.
99-
*/
100-
public static function removeScriptsFromDocument(HtmlDocument $doc): void
101-
{
102-
static::removeActiveContentFromDocument($doc);
117+
// Remove meta tag to prevent external redirects
118+
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
119+
static::removeNodes($metaTags);
103120
}
104121

105-
/**
106-
* Alias using the old method name to avoid potential compatibility breaks during patch release.
107-
* To remove in future feature release.
108-
* @deprecated Use removeActiveContentFromHtmlString instead.
109-
*/
110-
public static function removeScriptsFromHtmlString(string $html): string
122+
protected function filterOutNonContentElementsFromDocument(HtmlDocument $doc): void
111123
{
112-
return static::removeActiveContentFromHtmlString($html);
124+
// Remove non-content elements
125+
$formElements = ['link', 'style', 'meta', 'title', 'template'];
126+
foreach ($formElements as $formElement) {
127+
$matchingFormElements = $doc->queryXPath('//' . $formElement);
128+
static::removeNodes($matchingFormElements);
129+
}
113130
}
114131

115132
/**
@@ -147,4 +164,34 @@ protected static function removeAttributes(DOMNodeList $attrs): void
147164
$parentNode->removeAttribute($attrName);
148165
}
149166
}
167+
168+
/**
169+
* Alias using the old method name to avoid potential compatibility breaks during patch release.
170+
* To remove in future feature release.
171+
* @deprecated Use filterDocument instead.
172+
*/
173+
public static function removeScriptsFromDocument(HtmlDocument $doc): void
174+
{
175+
$config = new HtmlContentFilterConfig(
176+
filterOutNonContentElements: false,
177+
useAllowListFilter: false,
178+
);
179+
$filter = new static($config);
180+
$filter->filterDocument($doc);
181+
}
182+
183+
/**
184+
* Alias using the old method name to avoid potential compatibility breaks during patch release.
185+
* To remove in future feature release.
186+
* @deprecated Use filterString instead.
187+
*/
188+
public static function removeScriptsFromHtmlString(string $html): string
189+
{
190+
$config = new HtmlContentFilterConfig(
191+
filterOutNonContentElements: false,
192+
useAllowListFilter: false,
193+
);
194+
$filter = new static($config);
195+
return $filter->filterString($html);
196+
}
150197
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<?php
2+
3+
namespace BookStack\Util;
4+
5+
readonly class HtmlContentFilterConfig
6+
{
7+
public function __construct(
8+
public bool $filterOutJavaScript = true,
9+
public bool $filterOutBadHtmlElements = true,
10+
public bool $filterOutFormElements = true,
11+
public bool $filterOutNonContentElements = true,
12+
public bool $useAllowListFilter = true,
13+
) {
14+
}
15+
16+
/**
17+
* Create an instance from a config string, where the string
18+
* is a combination of characters to enable filters.
19+
*/
20+
public static function fromConfigString(string $config): self
21+
{
22+
$config = strtolower($config);
23+
return new self(
24+
filterOutJavaScript: str_contains($config, 'j'),
25+
filterOutBadHtmlElements: str_contains($config, 'h'),
26+
filterOutFormElements: str_contains($config, 'f'),
27+
filterOutNonContentElements: str_contains($config, 'h'),
28+
useAllowListFilter: str_contains($config, 'a'),
29+
);
30+
}
31+
}

composer.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"ext-zip": "*",
2020
"bacon/bacon-qr-code": "^3.0",
2121
"dompdf/dompdf": "^3.1",
22+
"ezyang/htmlpurifier": "^4.19",
2223
"guzzlehttp/guzzle": "^7.4",
2324
"intervention/image": "^3.5",
2425
"knplabs/knp-snappy": "^1.5",
@@ -38,7 +39,8 @@
3839
"socialiteproviders/microsoft-azure": "^5.1",
3940
"socialiteproviders/okta": "^4.2",
4041
"socialiteproviders/twitch": "^5.3",
41-
"ssddanbrown/htmldiff": "^2.0.0"
42+
"ssddanbrown/htmldiff": "^2.0.0",
43+
"xemlock/htmlpurifier-html5": "^0.1.12"
4244
},
4345
"require-dev": {
4446
"fakerphp/faker": "^1.21",

0 commit comments

Comments
 (0)