55use DOMAttr ;
66use DOMElement ;
77use DOMNodeList ;
8+ use HTMLPurifier ;
9+ use HTMLPurifier_HTML5Config ;
810
911class HtmlContentFilter
1012{
11- /**
12- * Remove all active content from the given HTML document.
13- * This aims to cover anything which can dynamically deal with, or send, data
14- * like any JavaScript actions or form content.
15- */
16- public static function removeActiveContentFromDocument (HtmlDocument $ doc ): void
/**
 * Filter options consulted by filterDocument() to decide which stages run.
 */
protected HtmlContentFilterConfig $config;

/**
 * Create a filter which behaves according to the given config.
 */
public function __construct(HtmlContentFilterConfig $config)
{
    $this->config = $config;
}
17+
/**
 * Run each filter stage that is enabled in the config against the given
 * document, then return the inner HTML of the document body.
 * When the allow-list option is enabled, that HTML is additionally passed
 * through the allow-list filter before being returned.
 * Each stage is handed the document itself and returns nothing, so the
 * document passed in is expected to be modified in place.
 */
public function filterDocument(HtmlDocument $doc): string
{
    // Config flag => stage method, listed in the order the stages must run.
    $stages = [
        'filterOutJavaScript'         => 'filterOutScriptsFromDocument',
        'filterOutFormElements'       => 'filterOutFormElementsFromDocument',
        'filterOutBadHtmlElements'    => 'filterOutBadHtmlElementsFromDocument',
        'filterOutNonContentElements' => 'filterOutNonContentElementsFromDocument',
    ];

    foreach ($stages as $configFlag => $stageMethod) {
        // The flag is read just before its stage, as each stage is reached.
        if ($this->config->{$configFlag}) {
            $this->{$stageMethod}($doc);
        }
    }

    $bodyHtml = $doc->getBodyInnerHtml();

    return $this->config->useAllowListFilter
        ? $this->applyAllowListFiltering($bodyHtml)
        : $bodyHtml;
}
40+
/**
 * Filter a string of HTML by loading it into a new HtmlDocument
 * and running that document through filterDocument().
 */
public function filterString(string $html): string
{
    $doc = new HtmlDocument($html);

    return $this->filterDocument($doc);
}
45+
/**
 * Pass the given HTML through HTML Purifier and return the purified result.
 *
 * The purifier is created from the default HTML5 config, with its serializer
 * cache path set to the value of storage_path('purifier').
 * Since that configuration never varies between calls, the purifier is built
 * on first use and then re-used, instead of being rebuilt for every string.
 */
protected function applyAllowListFiltering(string $html): string
{
    // Held across calls so the config and purifier are only constructed once.
    static $purifier = null;

    if ($purifier === null) {
        $config = HTMLPurifier_HTML5Config::createDefault();
        $config->set('Cache.SerializerPath', storage_path('purifier'));
        $purifier = new HTMLPurifier($config);
    }

    return $purifier->purify($html);
}
53+
54+ protected function filterOutScriptsFromDocument (HtmlDocument $ doc ): void
1755 {
1856 // Remove standard script tags
1957 $ scriptElems = $ doc ->queryXPath ('//script ' );
@@ -27,10 +65,6 @@ public static function removeActiveContentFromDocument(HtmlDocument $doc): void
2765 $ badForms = $ doc ->queryXPath ('//*[ ' . static ::xpathContains ('@action ' , 'javascript: ' ) . '] | //*[ ' . static ::xpathContains ('@formaction ' , 'javascript: ' ) . '] ' );
2866 static ::removeNodes ($ badForms );
2967
30- // Remove meta tag to prevent external redirects
31- $ metaTags = $ doc ->queryXPath ('//meta[ ' . static ::xpathContains ('@content ' , 'url ' ) . '] ' );
32- static ::removeNodes ($ metaTags );
33-
3468 // Remove data or JavaScript iFrames
3569 $ badIframes = $ doc ->queryXPath ('//*[ ' . static ::xpathContains ('@src ' , 'data: ' ) . '] | //*[ ' . static ::xpathContains ('@src ' , 'javascript: ' ) . '] | //*[@srcdoc] ' );
3670 static ::removeNodes ($ badIframes );
@@ -49,7 +83,10 @@ public static function removeActiveContentFromDocument(HtmlDocument $doc): void
4983 // Remove 'on*' attributes
5084 $ onAttributes = $ doc ->queryXPath ('//@*[starts-with(name(), \'on \')] ' );
5185 static ::removeAttributes ($ onAttributes );
86+ }
5287
88+ protected function filterOutFormElementsFromDocument (HtmlDocument $ doc ): void
89+ {
5390 // Remove form elements
5491 $ formElements = ['form ' , 'fieldset ' , 'button ' , 'textarea ' , 'select ' ];
5592 foreach ($ formElements as $ formElement ) {
@@ -75,41 +112,21 @@ public static function removeActiveContentFromDocument(HtmlDocument $doc): void
75112 }
76113 }
77114
78- /**
79- * Remove active content from the given HTML string.
80- * This aims to cover anything which can dynamically deal with, or send, data
81- * like any JavaScript actions or form content.
82- */
83- public static function removeActiveContentFromHtmlString (string $ html ): string
115+ protected function filterOutBadHtmlElementsFromDocument (HtmlDocument $ doc ): void
84116 {
85- if (empty ($ html )) {
86- return $ html ;
87- }
88-
89- $ doc = new HtmlDocument ($ html );
90- static ::removeActiveContentFromDocument ($ doc );
91-
92- return $ doc ->getBodyInnerHtml ();
93- }
94-
95- /**
96- * Alias using the old method name to avoid potential compatibility breaks during patch release.
97- * To remove in future feature release.
98- * @deprecated Use removeActiveContentFromDocument instead.
99- */
100- public static function removeScriptsFromDocument (HtmlDocument $ doc ): void
101- {
102- static ::removeActiveContentFromDocument ($ doc );
117+ // Remove meta tag to prevent external redirects
118+ $ metaTags = $ doc ->queryXPath ('//meta[ ' . static ::xpathContains ('@content ' , 'url ' ) . '] ' );
119+ static ::removeNodes ($ metaTags );
103120 }
104121
105- /**
106- * Alias using the old method name to avoid potential compatibility breaks during patch release.
107- * To remove in future feature release.
108- * @deprecated Use removeActiveContentFromHtmlString instead.
109- */
110- public static function removeScriptsFromHtmlString (string $ html ): string
122+ protected function filterOutNonContentElementsFromDocument (HtmlDocument $ doc ): void
111123 {
112- return static ::removeActiveContentFromHtmlString ($ html );
124+ // Remove non-content elements
125+ $ formElements = ['link ' , 'style ' , 'meta ' , 'title ' , 'template ' ];
126+ foreach ($ formElements as $ formElement ) {
127+ $ matchingFormElements = $ doc ->queryXPath ('// ' . $ formElement );
128+ static ::removeNodes ($ matchingFormElements );
129+ }
113130 }
114131
115132 /**
@@ -147,4 +164,34 @@ protected static function removeAttributes(DOMNodeList $attrs): void
147164 $ parentNode ->removeAttribute ($ attrName );
148165 }
149166 }
167+
/**
 * Legacy entry point kept under its old method name to avoid potential
 * compatibility breaks during a patch release.
 * To be removed in a future feature release.
 * Runs filterDocument() on the given document with the non-content-element
 * and allow-list filters switched off; the HTML it returns is discarded.
 * @deprecated Use filterDocument instead.
 */
public static function removeScriptsFromDocument(HtmlDocument $doc): void
{
    $legacyFilter = new static(new HtmlContentFilterConfig(
        filterOutNonContentElements: false,
        useAllowListFilter: false,
    ));

    $legacyFilter->filterDocument($doc);
}
182+
/**
 * Alias using the old method name to avoid potential compatibility breaks during patch release.
 * To remove in future feature release.
 * Filters the given HTML string via filterString() with the non-content-element
 * and allow-list filters switched off, and returns the filtered HTML.
 * Empty input is returned as-is without being filtered.
 * @deprecated Use filterString instead.
 */
public static function removeScriptsFromHtmlString(string $html): string
{
    // The implementation this alias previously pointed at returned empty
    // input untouched; keep that so existing callers see no change.
    if (empty($html)) {
        return $html;
    }

    $config = new HtmlContentFilterConfig(
        filterOutNonContentElements: false,
        useAllowListFilter: false,
    );
    $filter = new static($config);

    return $filter->filterString($html);
}
150197}
0 commit comments