diff --git a/src/FSharp.Data.Html.Core/HtmlParser.fs b/src/FSharp.Data.Html.Core/HtmlParser.fs index af3d59bec..12f72dd5f 100644 --- a/src/FSharp.Data.Html.Core/HtmlParser.fs +++ b/src/FSharp.Data.Html.Core/HtmlParser.fs @@ -77,16 +77,15 @@ module internal HtmlParser = String(buffer) type CharList = - { mutable Contents: char list } + { mutable Contents: StringBuilder } - static member Empty = { Contents = [] } + static member Empty = { Contents = StringBuilder() } - override x.ToString() = - String(x.Contents |> List.rev |> List.toArray) + override x.ToString() = x.Contents.ToString() - member x.Cons(c) = x.Contents <- c :: x.Contents + member x.Cons(c: char) = x.Contents.Append(c) |> ignore member x.Length = x.Contents.Length - member x.Clear() = x.Contents <- [] + member x.Clear() = x.Contents.Clear() |> ignore type InsertionMode = | DefaultMode @@ -116,8 +115,8 @@ module internal HtmlParser = static member Create(reader: TextReader) = { Attributes = [] - CurrentTag = CharList.Empty - Content = CharList.Empty + CurrentTag = { Contents = StringBuilder() } + Content = { Contents = StringBuilder() } HasFormattedParent = false InsertionMode = DefaultMode Tokens = [] @@ -133,7 +132,7 @@ module internal HtmlParser = member x.ContentLength = x.Content.Length member x.NewAttribute() = - x.Attributes <- (CharList.Empty, CharList.Empty) :: x.Attributes + x.Attributes <- ({ Contents = StringBuilder() }, { Contents = StringBuilder() }) :: x.Attributes member x.ConsAttrName() = match x.Attributes with @@ -170,7 +169,7 @@ module internal HtmlParser = member x.EmitSelfClosingTag() = let name = x.CurrentTag.ToString().Trim() let result = Tag(true, name, x.GetAttributes()) - x.CurrentTag <- CharList.Empty + x.CurrentTag <- { Contents = StringBuilder() } x.InsertionMode <- DefaultMode x.Attributes <- [] x.Tokens <- result :: x.Tokens @@ -212,7 +211,7 @@ module internal HtmlParser = else DefaultMode - x.CurrentTag <- CharList.Empty + x.CurrentTag <- { Contents = StringBuilder() } x.Attributes <- [] x.Tokens <- result :: x.Tokens @@ -223,7 +222,7 @@ module internal HtmlParser = for c in content.ToCharArray() do x.ConsAttrValue c - x.Content <- CharList.Empty + x.Content <- { Contents = StringBuilder() } x.InsertionMode <- DefaultMode member x.Emit() : unit = @@ -247,7 +246,7 @@ module internal HtmlParser = | DocTypeMode -> DocType content | CDATAMode -> CData(content.Replace("", "")) - x.Content <- CharList.Empty + x.Content <- { Contents = StringBuilder() } x.InsertionMode <- DefaultMode match result with @@ -255,9 +254,9 @@ module internal HtmlParser = | _ -> x.Tokens <- result :: x.Tokens member x.Cons() = x.Content.Cons(x.Reader.ReadChar()) - member x.Cons(char) = x.Content.Cons(char) - member x.Cons(char) = Array.iter (x.Content.Cons) char - member x.Cons(char: string) = x.Cons(char.ToCharArray()) + member x.Cons(char: char) = x.Content.Cons(char) + member x.Cons(chars: char array) = Array.iter (x.Content.Cons) chars + member x.Cons(chars: string) = x.Cons(chars.ToCharArray()) member x.ConsTag() = match x.Reader.ReadChar() with diff --git a/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj b/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj index 57119c667..306e84886 100644 --- a/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj +++ b/tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj @@ -14,6 +14,7 @@ PreserveNewest + diff --git a/tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs b/tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs new file mode 100644 index 000000000..885e58f97 --- /dev/null +++ b/tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs @@ -0,0 +1,47 @@ +namespace FSharp.Data.Benchmarks + +open System +open System.IO +open BenchmarkDotNet.Attributes +open FSharp.Data + +[] +[] +type HtmlBenchmarks() = + + let mutable simpleHtmlText = "" + let mutable zooplaHtmlText = "" + let mutable usPresidentsHtmlText = "" + let mutable doctorWhoHtmlText = "" + let mutable wimbledonHtmlText = "" + + [] + member this.Setup() = + let dataPath = Path.Combine(__SOURCE_DIRECTORY__, "../FSharp.Data.Tests/Data") + + // Load various HTML files of different sizes and complexity + simpleHtmlText <- File.ReadAllText(Path.Combine(dataPath, "SimpleHtmlTablesWithTr.html")) + zooplaHtmlText <- File.ReadAllText(Path.Combine(dataPath, "zoopla.html")) // ~773KB + usPresidentsHtmlText <- File.ReadAllText(Path.Combine(dataPath, "us_presidents_wikipedia.html")) // ~698KB + doctorWhoHtmlText <- File.ReadAllText(Path.Combine(dataPath, "doctor_who2.html")) // ~518KB + wimbledonHtmlText <- File.ReadAllText(Path.Combine(dataPath, "wimbledon_wikipedia.html")) // ~411KB + + [] + member this.ParseSimpleHtml() = + HtmlDocument.Parse(simpleHtmlText) + + [] + member this.ParseZooplaHtml() = + HtmlDocument.Parse(zooplaHtmlText) + + [] + member this.ParseUsPresidentsHtml() = + HtmlDocument.Parse(usPresidentsHtmlText) + + [] + member this.ParseDoctorWhoHtml() = + HtmlDocument.Parse(doctorWhoHtmlText) + + [] + member this.ParseWimbledonHtml() = + HtmlDocument.Parse(wimbledonHtmlText) \ No newline at end of file