diff --git a/app/Activity/Models/Comment.php b/app/Activity/Models/Comment.php index ab7d917729c..3faa76657b6 100644 --- a/app/Activity/Models/Comment.php +++ b/app/Activity/Models/Comment.php @@ -9,6 +9,7 @@ use BookStack\Users\Models\OwnableInterface; use BookStack\Util\HtmlContentFilter; use BookStack\Util\HtmlContentFilterConfig; +use BookStack\Util\HtmlToPlainText; use Illuminate\Database\Eloquent\Builder; use Illuminate\Database\Eloquent\Factories\HasFactory; use Illuminate\Database\Eloquent\Relations\BelongsTo; @@ -87,6 +88,12 @@ public function safeHtml(): string return $filter->filterString($this->html ?? ''); } + public function getPlainText(): string + { + $converter = new HtmlToPlainText(); + return $converter->convert($this->html ?? ''); + } + public function jointPermissions(): HasMany { return $this->hasMany(JointPermission::class, 'entity_id', 'commentable_id') diff --git a/app/Activity/Notifications/Messages/CommentCreationNotification.php b/app/Activity/Notifications/Messages/CommentCreationNotification.php index 30d0ffa2be4..d739f4aabbf 100644 --- a/app/Activity/Notifications/Messages/CommentCreationNotification.php +++ b/app/Activity/Notifications/Messages/CommentCreationNotification.php @@ -24,7 +24,7 @@ public function toMail(User $notifiable): MailMessage $locale->trans('notifications.detail_page_name') => new EntityLinkMessageLine($page), $locale->trans('notifications.detail_page_path') => $this->buildPagePathLine($page, $notifiable), $locale->trans('notifications.detail_commenter') => $this->user->name, - $locale->trans('notifications.detail_comment') => strip_tags($comment->html), + $locale->trans('notifications.detail_comment') => $comment->getPlainText(), ]); return $this->newMailMessage($locale) diff --git a/app/Activity/Notifications/Messages/CommentMentionNotification.php b/app/Activity/Notifications/Messages/CommentMentionNotification.php index de9e719633d..4c8ee5bab8b 100644 --- a/app/Activity/Notifications/Messages/CommentMentionNotification.php +++ b/app/Activity/Notifications/Messages/CommentMentionNotification.php @@ -24,7 +24,7 @@ public function toMail(User $notifiable): MailMessage $locale->trans('notifications.detail_page_name') => new EntityLinkMessageLine($page), $locale->trans('notifications.detail_page_path') => $this->buildPagePathLine($page, $notifiable), $locale->trans('notifications.detail_commenter') => $this->user->name, - $locale->trans('notifications.detail_comment') => strip_tags($comment->html), + $locale->trans('notifications.detail_comment') => $comment->getPlainText(), ]); return $this->newMailMessage($locale) diff --git a/app/Entities/Repos/BaseRepo.php b/app/Entities/Repos/BaseRepo.php index 717e9c9f82a..44baeaccfdc 100644 --- a/app/Entities/Repos/BaseRepo.php +++ b/app/Entities/Repos/BaseRepo.php @@ -16,6 +16,7 @@ use BookStack\Sorting\BookSorter; use BookStack\Uploads\ImageRepo; use BookStack\Util\HtmlDescriptionFilter; +use BookStack\Util\HtmlToPlainText; use Illuminate\Http\UploadedFile; class BaseRepo @@ -151,9 +152,10 @@ protected function updateDescription(Entity $entity, array $input): void } if (isset($input['description_html'])) { + $plainTextConverter = new HtmlToPlainText(); $entity->descriptionInfo()->set( HtmlDescriptionFilter::filterFromString($input['description_html']), - html_entity_decode(strip_tags($input['description_html'])) + $plainTextConverter->convert($input['description_html']), ); } else if (isset($input['description'])) { $entity->descriptionInfo()->set('', $input['description']); diff --git a/app/Entities/Tools/PageContent.php b/app/Entities/Tools/PageContent.php index 8d89a86cff4..b86fbbe8bdd 100644 --- a/app/Entities/Tools/PageContent.php +++ b/app/Entities/Tools/PageContent.php @@ -16,6 +16,7 @@ use BookStack\Util\HtmlContentFilter; use BookStack\Util\HtmlContentFilterConfig; use BookStack\Util\HtmlDocument; +use BookStack\Util\HtmlToPlainText; use BookStack\Util\WebSafeMimeSniffer; use Closure; use DOMElement; @@ -303,8 +304,8 @@ protected function setUniqueId(DOMNode $element, array &$idMap): array public function toPlainText(): string { $html = $this->render(true); - - return html_entity_decode(strip_tags($html)); + $converter = new HtmlToPlainText(); + return $converter->convert($html); } /** diff --git a/app/Exports/ExportFormatter.php b/app/Exports/ExportFormatter.php index c5973eace29..dec8aa23d8f 100644 --- a/app/Exports/ExportFormatter.php +++ b/app/Exports/ExportFormatter.php @@ -11,6 +11,7 @@ use BookStack\Uploads\ImageService; use BookStack\Util\CspService; use BookStack\Util\HtmlDocument; +use BookStack\Util\HtmlToPlainText; use DOMElement; use Exception; use Throwable; @@ -242,24 +243,13 @@ protected function containHtml(string $htmlContent): string /** * Converts the page contents into simple plain text. - * This method filters any bad looking content to provide a nice final output. + * We re-generate the plain text from HTML at this point, post-page-content rendering. */ public function pageToPlainText(Page $page, bool $pageRendered = false, bool $fromParent = false): string { $html = $pageRendered ? $page->html : (new PageContent($page))->render(); - // Add proceeding spaces before tags so spaces remain between - // text within elements after stripping tags. - $html = str_replace('<', " <", $html); - $text = trim(strip_tags($html)); - // Replace multiple spaces with single spaces - $text = preg_replace('/ {2,}/', ' ', $text); - // Reduce multiple horrid whitespace characters. - $text = preg_replace('/(\x0A|\xA0|\x0A|\r|\n){2,}/su', "\n\n", $text); - $text = html_entity_decode($text); - // Add title - $text = $page->name . ($fromParent ? "\n" : "\n\n") . $text; - - return $text; + $contentText = (new HtmlToPlainText())->convert($html); + return $page->name . ($fromParent ? "\n" : "\n\n") . $contentText; } /** @@ -267,7 +257,7 @@ public function pageToPlainText(Page $page, bool $pageRendered = false, bool $fr */ public function chapterToPlainText(Chapter $chapter): string { - $text = $chapter->name . "\n" . $chapter->description; + $text = $chapter->name . "\n" . $chapter->descriptionInfo()->getPlain(); $text = trim($text) . "\n\n"; $parts = []; diff --git a/app/Util/HtmlToPlainText.php b/app/Util/HtmlToPlainText.php new file mode 100644 index 00000000000..79da9e3d862 --- /dev/null +++ b/app/Util/HtmlToPlainText.php @@ -0,0 +1,47 @@ +nodeToText($doc->getBody()); + + // Remove repeated newlines + $text = preg_replace('/\n+/', "\n", $text); + // Remove leading/trailing whitespace + $text = trim($text); + + return $text; + } + + protected function nodeToText(\DOMNode $node): string + { + if ($node->nodeType === XML_TEXT_NODE) { + return $node->textContent; + } + + $text = ''; + if (!in_array($node->nodeName, $this->inlineTags)) { + $text .= "\n"; + } + + foreach ($node->childNodes as $childNode) { + $text .= $this->nodeToText($childNode); + } + + return $text; + } +} diff --git a/tests/Exports/TextExportTest.php b/tests/Exports/TextExportTest.php index 4b2d6288775..26298c185da 100644 --- a/tests/Exports/TextExportTest.php +++ b/tests/Exports/TextExportTest.php @@ -52,7 +52,7 @@ public function test_book_text_export_format() $resp = $this->asEditor()->get($entities['book']->getUrl('/export/plaintext')); $expected = "Export Book\nThis is a book with stuff to export\n\nExport chapter\nA test chapter to be exported\nIt has loads of info within\n\n"; - $expected .= "My wonderful page!\nMy great page Full of great stuff"; + $expected .= "My wonderful page!\nMy great page\nFull of great stuff"; $resp->assertSee($expected); } @@ -82,7 +82,7 @@ public function test_chapter_text_export_format() $resp = $this->asEditor()->get($entities['book']->getUrl('/export/plaintext')); $expected = "Export chapter\nA test chapter to be exported\nIt has loads of info within\n\n"; - $expected .= "My wonderful page!\nMy great page Full of great stuff"; + $expected .= "My wonderful page!\nMy great page\nFull of great stuff"; $resp->assertSee($expected); } } diff --git a/tests/Util/HtmlToPlainTextTest.php b/tests/Util/HtmlToPlainTextTest.php new file mode 100644 index 00000000000..e522e486360 --- /dev/null +++ b/tests/Util/HtmlToPlainTextTest.php @@ -0,0 +1,63 @@ +This is a test
+more <©> text with bold
+HTML; + $expected = <<