Skip to content

Commit 483317f

Browse files
github-actions[bot], Repo Assist, Copilot, dsyme
authored
Fix HTML parser dropping whitespace between inline elements (issue #1330) (#1630)
Add an InlineWhitespace token to the HTML tokeniser so that normalised, whitespace-only DefaultMode text is tracked separately from real text content. In the tree builder, an InlineWhitespace token is turned into an HtmlText " " node only when both the node built immediately before it and the token immediately after it are inline content; otherwise it is silently discarded.

This preserves the significant inter-element space in:

- <span>Hello,</span> <span>World</span> -> "Hello, World"
- &lt; &gt; -> "< >"

while still dropping insignificant inter-block whitespace (e.g. newlines between <head> and <body>), so existing tests continue to pass.

Whitespace produced by character references (&#32;, &nbsp;, &Tab;, ...) goes through the CharRefMode path and is never turned into InlineWhitespace, so it is never filtered.

Three regression tests are added, covering the cases from the issue.

Co-authored-by: Repo Assist <copilot@github.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Don Syme <dsyme@users.noreply.github.com>
1 parent 73f6dfe commit 483317f

2 files changed

Lines changed: 121 additions & 1 deletion

File tree

src/FSharp.Data.Html.Core/HtmlParser.fs

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ module internal HtmlParser =
4949
| Text of string
5050
| Comment of string
5151
| CData of string
52+
| InlineWhitespace // normalised whitespace-only DefaultMode text; kept only between inline siblings
5253
| EOF
5354

5455
override x.ToString() =
@@ -60,6 +61,7 @@ module internal HtmlParser =
6061
| Comment _ -> "comment"
6162
| EOF -> "eof"
6263
| CData _ -> "cdata"
64+
| InlineWhitespace -> "inlineWhitespace"
6365

6466
member x.IsEndTag name =
6567
match x with
@@ -237,7 +239,7 @@ module internal HtmlParser =
237239
let normalizedContent = wsRegex.Value.Replace(content, " ")
238240

239241
if normalizedContent = " " then
240-
Text ""
242+
InlineWhitespace // inter-element whitespace; kept only between inline siblings
241243
else
242244
Text normalizedContent
243245
| ScriptMode -> content |> Text
@@ -935,6 +937,68 @@ module internal HtmlParser =
935937

936938
state.Tokens |> List.rev
937939

940+
// Names of elements that are NOT treated as inline content when deciding whether
// a whitespace-only text run between two siblings is significant.
//
// The set holds the usual block-level elements (div, p, h1-h6, hr, lists, table
// parts, ...) together with structural / metadata elements such as html, head,
// body, title, meta, link, script and style. Whitespace-only text adjacent to any
// of these is inter-element whitespace and is insignificant.
//
// NOTE(review): membership is tested with Set.contains on the raw tag name, so the
// comparison is case-sensitive; this assumes tag names are lower-cased before they
// reach the lookup -- confirm against the tokeniser.
let private blockLevelElements =
    set
        [ "address"
          "article"
          "aside"
          "blockquote"
          "body"
          "caption"
          "col"
          "colgroup"
          "dd"
          "details"
          "dialog"
          "dir"
          "div"
          "dl"
          "dt"
          "fieldset"
          "figcaption"
          "figure"
          "footer"
          "form"
          "frameset"
          "h1"
          "h2"
          "h3"
          "h4"
          "h5"
          "h6"
          "head"
          "header"
          "hgroup"
          // <hr> is rendered as a block, so whitespace next to it is not significant.
          "hr"
          "html"
          "legend"
          "li"
          "link"
          "main"
          "menu"
          "meta"
          "nav"
          "noscript"
          "ol"
          "optgroup"
          "option"
          "p"
          "pre"
          "script"
          "section"
          "style"
          "summary"
          "table"
          "tbody"
          "td"
          "tfoot"
          "th"
          "thead"
          "title"
          "tr"
          "ul" ]
1001+
9381002
let private parse reader =
9391003
let canNotHaveChildren (name: string) =
9401004
match name with
@@ -1050,6 +1114,26 @@ module internal HtmlParser =
10501114
// ignore this token if not the expected end tag (or it's reverse, eg: <li></il>)
10511115
parse' docType elements expectedTagEnd parentTagName rest
10521116
| TagEnd _ :: rest -> recursiveReturn (docType, rest, List.rev elements)
1117+
| InlineWhitespace :: rest ->
1118+
// This is normalised whitespace-only content from DefaultMode (e.g. the space
1119+
// between "</span> <span>"). Keep it as a space text node only when BOTH the
1120+
// previous accumulated node and the next token represent inline content.
1121+
let prevIsInline =
1122+
match elements with
1123+
| HtmlNode.HtmlElement(name, _, _) :: _ -> not (Set.contains name blockLevelElements)
1124+
| HtmlNode.HtmlText t :: _ -> not (String.IsNullOrWhiteSpace t)
1125+
| _ -> false
1126+
1127+
let nextIsInline =
1128+
match rest with
1129+
| Text t :: _ when t <> "" -> true
1130+
| Tag(_, name, _) :: _ -> not (Set.contains name blockLevelElements)
1131+
| _ -> false
1132+
1133+
if prevIsInline && nextIsInline then
1134+
parse' docType (HtmlNode.HtmlText " " :: elements) expectedTagEnd parentTagName rest
1135+
else
1136+
parse' docType elements expectedTagEnd parentTagName rest
10531137
| Text a :: Text b :: rest ->
10541138
if a = "" && b = "" then
10551139
// ignore this token

tests/FSharp.Data.Core.Tests/HtmlParser.fs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,3 +983,39 @@ let ``Can handle incomplete tags at end of file without creating an infinite loo
983983
("html",
984984
[ HtmlNode.NewElement("head")])]
985985
result |> should equal expected
986+
987+
[<Test>]
let ``Preserves space between entity references in inline content (issue 1330)`` () =
    // The single space separating the two entity references has to be present
    // in the inner text of the enclosing <p>.
    let document = HtmlDocument.Parse "<p>&lt; &gt;</p>"

    let paragraph =
        document |> HtmlDocument.descendantsNamed true [ "p" ] |> Seq.head

    HtmlNode.innerText paragraph |> should equal "< >"
996+
997+
[<Test>]
let ``Preserves space between adjacent inline elements (issue 1330)`` () =
    // Two <span> siblings separated by one space: the space must be kept.
    let document =
        HtmlDocument.Parse "<div><span>Hello,</span> <span>World</span></div>"

    let container =
        document |> HtmlDocument.descendantsNamed true [ "div" ] |> Seq.head

    HtmlNode.innerText container |> should equal "Hello, World"
1006+
1007+
[<Test>]
let ``Drops inter-block whitespace but keeps inline whitespace`` () =
    // The line breaks between the <li> block siblings must not leak into any
    // inner text, while the space between the two <span> elements inside the
    // first <li> must be retained.
    let html =
        """<ul>
<li><span>A</span> <span>B</span></li>
<li>C</li>
</ul>"""

    let listItemTexts =
        HtmlDocument.Parse html
        |> HtmlDocument.descendantsNamed true [ "li" ]
        |> Seq.map HtmlNode.innerText
        |> List.ofSeq

    listItemTexts |> should equal [ "A B"; "C" ]

0 commit comments

Comments
 (0)