diff --git a/src/FSharp.Data.Html.Core/HtmlParser.fs b/src/FSharp.Data.Html.Core/HtmlParser.fs
index af3d59bec..12f72dd5f 100644
--- a/src/FSharp.Data.Html.Core/HtmlParser.fs
+++ b/src/FSharp.Data.Html.Core/HtmlParser.fs
@@ -77,16 +77,15 @@ module internal HtmlParser =
String(buffer)
type CharList =
- { mutable Contents: char list }
+ { mutable Contents: StringBuilder } // character buffer; chars are kept in the order they were added
- static member Empty = { Contents = [] }
+ static member Empty = { Contents = StringBuilder() } // property body runs on every access, so each caller gets its own builder
- override x.ToString() =
- String(x.Contents |> List.rev |> List.toArray)
+ override x.ToString() = x.Contents.ToString() // no reversal needed: unlike the old list, the builder is already in insertion order
- member x.Cons(c) = x.Contents <- c :: x.Contents
+ member x.Cons(c: char) = x.Contents.Append(c) |> ignore // appends c at the end; the old list version prepended and reversed in ToString
member x.Length = x.Contents.Length
- member x.Clear() = x.Contents <- []
+ member x.Clear() = x.Contents.Clear() |> ignore // empties the existing builder in place instead of swapping in a new value
type InsertionMode =
| DefaultMode
@@ -116,8 +115,8 @@ module internal HtmlParser =
static member Create(reader: TextReader) =
{ Attributes = []
- CurrentTag = CharList.Empty
- Content = CharList.Empty
+ CurrentTag = { Contents = StringBuilder() }
+ Content = { Contents = StringBuilder() }
HasFormattedParent = false
InsertionMode = DefaultMode
Tokens = []
@@ -133,7 +132,7 @@ module internal HtmlParser =
member x.ContentLength = x.Content.Length
member x.NewAttribute() =
- x.Attributes <- (CharList.Empty, CharList.Empty) :: x.Attributes
+ x.Attributes <- ({ Contents = StringBuilder() }, { Contents = StringBuilder() }) :: x.Attributes // push a pair of separate empty buffers onto the head of Attributes (presumably attribute name and value - confirm against ConsAttrName/ConsAttrValue)
member x.ConsAttrName() =
match x.Attributes with
@@ -170,7 +169,7 @@ module internal HtmlParser =
member x.EmitSelfClosingTag() =
let name = x.CurrentTag.ToString().Trim()
let result = Tag(true, name, x.GetAttributes())
- x.CurrentTag <- CharList.Empty
+ x.CurrentTag <- { Contents = StringBuilder() }
x.InsertionMode <- DefaultMode
x.Attributes <- []
x.Tokens <- result :: x.Tokens
@@ -212,7 +211,7 @@ module internal HtmlParser =
else
DefaultMode
- x.CurrentTag <- CharList.Empty
+ x.CurrentTag <- { Contents = StringBuilder() }
x.Attributes <- []
x.Tokens <- result :: x.Tokens
@@ -223,7 +222,7 @@ module internal HtmlParser =
for c in content.ToCharArray() do
x.ConsAttrValue c
- x.Content <- CharList.Empty
+ x.Content <- { Contents = StringBuilder() }
x.InsertionMode <- DefaultMode
member x.Emit() : unit =
@@ -247,7 +246,7 @@ module internal HtmlParser =
| DocTypeMode -> DocType content
| CDATAMode -> CData(content.Replace("", ""))
- x.Content <- CharList.Empty
+ x.Content <- { Contents = StringBuilder() }
x.InsertionMode <- DefaultMode
match result with
@@ -255,9 +254,9 @@ module internal HtmlParser =
| _ -> x.Tokens <- result :: x.Tokens
member x.Cons() = x.Content.Cons(x.Reader.ReadChar())
- member x.Cons(char) = x.Content.Cons(char)
- member x.Cons(char) = Array.iter (x.Content.Cons) char
- member x.Cons(char: string) = x.Cons(char.ToCharArray())
+ member x.Cons(char: char) = x.Content.Cons(char) // append one character to the pending content buffer
+ member x.Cons(chars: char array) = x.Content.Contents.Append(chars) |> ignore // bulk append to the backing StringBuilder; avoids one closure call per character
+ member x.Cons(chars: string) = x.Content.Contents.Append(chars) |> ignore // append the string directly; avoids the temporary char[] made by ToCharArray
member x.ConsTag() =
match x.Reader.ReadChar() with
diff --git a/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj b/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj
index 57119c667..306e84886 100644
--- a/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj
+++ b/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj
@@ -14,6 +14,7 @@
PreserveNewest
+    <Compile Include="HtmlBenchmarks.fs" />
diff --git a/tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs b/tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs
new file mode 100644
index 000000000..885e58f97
--- /dev/null
+++ b/tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs
@@ -0,0 +1,60 @@
+namespace FSharp.Data.Benchmarks
+
+open System
+open System.IO
+open BenchmarkDotNet.Attributes
+open FSharp.Data
+
+// NOTE(review): both type-level attributes arrived as empty `[]` in this patch;
+// MemoryDiagnoser and SimpleJob are assumed here - confirm before merging.
+/// BenchmarkDotNet suite that times HtmlDocument.Parse on sample HTML pages
+/// read from the FSharp.Data.Tests data directory.
+[<MemoryDiagnoser>]
+[<SimpleJob>]
+type HtmlBenchmarks() =
+
+    // Sample documents; empty until Setup assigns them.
+    let mutable simpleHtmlText = ""
+    let mutable zooplaHtmlText = ""
+    let mutable usPresidentsHtmlText = ""
+    let mutable doctorWhoHtmlText = ""
+    let mutable wimbledonHtmlText = ""
+
+    /// Reads every sample document into memory up front, so the benchmark
+    /// methods themselves only call the parser.
+    [<GlobalSetup>]
+    member this.Setup() =
+        let dataPath = Path.Combine(__SOURCE_DIRECTORY__, "../FSharp.Data.Tests/Data")
+        let load fileName = File.ReadAllText(Path.Combine(dataPath, fileName))
+
+        // Load various HTML files of different sizes and complexity
+        simpleHtmlText <- load "SimpleHtmlTablesWithTr.html"
+        zooplaHtmlText <- load "zoopla.html" // ~773KB
+        usPresidentsHtmlText <- load "us_presidents_wikipedia.html" // ~698KB
+        doctorWhoHtmlText <- load "doctor_who2.html" // ~518KB
+        wimbledonHtmlText <- load "wimbledon_wikipedia.html" // ~411KB
+
+    /// Parses the SimpleHtmlTablesWithTr.html sample.
+    [<Benchmark>]
+    member this.ParseSimpleHtml() =
+        HtmlDocument.Parse(simpleHtmlText)
+
+    /// Parses the zoopla.html sample.
+    [<Benchmark>]
+    member this.ParseZooplaHtml() =
+        HtmlDocument.Parse(zooplaHtmlText)
+
+    /// Parses the us_presidents_wikipedia.html sample.
+    [<Benchmark>]
+    member this.ParseUsPresidentsHtml() =
+        HtmlDocument.Parse(usPresidentsHtmlText)
+
+    /// Parses the doctor_who2.html sample.
+    [<Benchmark>]
+    member this.ParseDoctorWhoHtml() =
+        HtmlDocument.Parse(doctorWhoHtmlText)
+
+    /// Parses the wimbledon_wikipedia.html sample.
+    [<Benchmark>]
+    member this.ParseWimbledonHtml() =
+        HtmlDocument.Parse(wimbledonHtmlText)
\ No newline at end of file