Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions src/FSharp.Data.Html.Core/HtmlParser.fs
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,15 @@ module internal HtmlParser =
String(buffer)

type CharList =
{ mutable Contents: char list }
{ mutable Contents: StringBuilder }

static member Empty = { Contents = [] }
static member Empty = { Contents = StringBuilder() }

override x.ToString() =
String(x.Contents |> List.rev |> List.toArray)
override x.ToString() = x.Contents.ToString()

member x.Cons(c) = x.Contents <- c :: x.Contents
member x.Cons(c: char) = x.Contents.Append(c) |> ignore
member x.Length = x.Contents.Length
member x.Clear() = x.Contents <- []
member x.Clear() = x.Contents.Clear() |> ignore

type InsertionMode =
| DefaultMode
Expand Down Expand Up @@ -116,8 +115,8 @@ module internal HtmlParser =

static member Create(reader: TextReader) =
{ Attributes = []
CurrentTag = CharList.Empty
Content = CharList.Empty
CurrentTag = { Contents = StringBuilder() }
Content = { Contents = StringBuilder() }
HasFormattedParent = false
InsertionMode = DefaultMode
Tokens = []
Expand All @@ -133,7 +132,7 @@ module internal HtmlParser =
member x.ContentLength = x.Content.Length

member x.NewAttribute() =
x.Attributes <- (CharList.Empty, CharList.Empty) :: x.Attributes
x.Attributes <- ({ Contents = StringBuilder() }, { Contents = StringBuilder() }) :: x.Attributes

member x.ConsAttrName() =
match x.Attributes with
Expand Down Expand Up @@ -170,7 +169,7 @@ module internal HtmlParser =
member x.EmitSelfClosingTag() =
let name = x.CurrentTag.ToString().Trim()
let result = Tag(true, name, x.GetAttributes())
x.CurrentTag <- CharList.Empty
x.CurrentTag <- { Contents = StringBuilder() }
x.InsertionMode <- DefaultMode
x.Attributes <- []
x.Tokens <- result :: x.Tokens
Expand Down Expand Up @@ -212,7 +211,7 @@ module internal HtmlParser =
else
DefaultMode

x.CurrentTag <- CharList.Empty
x.CurrentTag <- { Contents = StringBuilder() }
x.Attributes <- []
x.Tokens <- result :: x.Tokens

Expand All @@ -223,7 +222,7 @@ module internal HtmlParser =
for c in content.ToCharArray() do
x.ConsAttrValue c

x.Content <- CharList.Empty
x.Content <- { Contents = StringBuilder() }
x.InsertionMode <- DefaultMode

member x.Emit() : unit =
Expand All @@ -247,17 +246,17 @@ module internal HtmlParser =
| DocTypeMode -> DocType content
| CDATAMode -> CData(content.Replace("<![CDATA[", "").Replace("]]>", ""))

x.Content <- CharList.Empty
x.Content <- { Contents = StringBuilder() }
x.InsertionMode <- DefaultMode

match result with
| Text t when String.IsNullOrEmpty(t) -> ()
| _ -> x.Tokens <- result :: x.Tokens

member x.Cons() = x.Content.Cons(x.Reader.ReadChar())
member x.Cons(char) = x.Content.Cons(char)
member x.Cons(char) = Array.iter (x.Content.Cons) char
member x.Cons(char: string) = x.Cons(char.ToCharArray())
member x.Cons(char: char) = x.Content.Cons(char)
member x.Cons(chars: char array) = Array.iter (x.Content.Cons) chars
member x.Cons(chars: string) = x.Cons(chars.ToCharArray())

member x.ConsTag() =
match x.Reader.ReadChar() with
Expand Down
1 change: 1 addition & 0 deletions tests/FSharp.Data.Benchmarks/FSharp.Data.Benchmarks.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</EmbeddedResource>
<Compile Include="JsonBenchmarks.fs" />
<Compile Include="HtmlBenchmarks.fs" />
<Compile Include="Program.fs" />
</ItemGroup>
<ItemGroup>
Expand Down
47 changes: 47 additions & 0 deletions tests/FSharp.Data.Benchmarks/HtmlBenchmarks.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
namespace FSharp.Data.Benchmarks

open System
open System.IO
open BenchmarkDotNet.Attributes
open FSharp.Data

/// BenchmarkDotNet suite that measures HtmlDocument.Parse on a set of sample
/// HTML documents read from the FSharp.Data.Tests data directory.
[<MemoryDiagnoser>]
[<SimpleJob>]
type HtmlBenchmarks() =

    // Sample documents. They are populated in Setup so that file I/O is
    // performed outside of the benchmarked methods.
    let mutable simpleHtmlText = ""
    let mutable zooplaHtmlText = ""
    let mutable usPresidentsHtmlText = ""
    let mutable doctorWhoHtmlText = ""
    let mutable wimbledonHtmlText = ""

    /// Reads every sample HTML file into memory before the benchmarks run.
    [<GlobalSetup>]
    member _.Setup() =
        // The samples are located relative to this source file.
        let sampleDirectory =
            Path.Combine(__SOURCE_DIRECTORY__, "../FSharp.Data.Tests/Data")

        // Returns the full text of the named file in the sample directory.
        let readSample (fileName: string) =
            File.ReadAllText(Path.Combine(sampleDirectory, fileName))

        // HTML files of different sizes and complexity
        // (sizes are the approximate figures noted by the original author).
        simpleHtmlText <- readSample "SimpleHtmlTablesWithTr.html"
        zooplaHtmlText <- readSample "zoopla.html" // ~773KB
        usPresidentsHtmlText <- readSample "us_presidents_wikipedia.html" // ~698KB
        doctorWhoHtmlText <- readSample "doctor_who2.html" // ~518KB
        wimbledonHtmlText <- readSample "wimbledon_wikipedia.html" // ~411KB

    /// Parses the small table-based sample document.
    [<Benchmark>]
    member _.ParseSimpleHtml() = HtmlDocument.Parse(simpleHtmlText)

    /// Parses the Zoopla sample page.
    [<Benchmark>]
    member _.ParseZooplaHtml() = HtmlDocument.Parse(zooplaHtmlText)

    /// Parses the US presidents Wikipedia sample page.
    [<Benchmark>]
    member _.ParseUsPresidentsHtml() = HtmlDocument.Parse(usPresidentsHtmlText)

    /// Parses the Doctor Who sample page.
    [<Benchmark>]
    member _.ParseDoctorWhoHtml() = HtmlDocument.Parse(doctorWhoHtmlText)

    /// Parses the Wimbledon Wikipedia sample page.
    [<Benchmark>]
    member _.ParseWimbledonHtml() = HtmlDocument.Parse(wimbledonHtmlText)
Loading