Skip to content

Commit 619a203

Browse files
committed
Avoid parsing the whole file when only the title is needed.
1 parent 84869a9 commit 619a203

1 file changed

Lines changed: 19 additions & 8 deletions

File tree

converter/generator/DocTransformer.cs

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Net;
22
using System.Text.Json;
3+
using System.Text.RegularExpressions;
34
using System.Xml.Linq;
45
using AngleSharp.Common;
56
using AngleSharp.Dom;
@@ -11,7 +12,7 @@
1112

1213
namespace OriginLab.DocumentGeneration;
1314

14-
internal abstract class DocTransformer
15+
internal abstract partial class DocTransformer
1516
{
1617
protected string SourceFolder { get; }
1718
protected string SourceFolderEn { get; }
@@ -148,14 +149,21 @@ protected void Transform(string sourceFile, string destinationFile, string langu
148149

149150
/// <summary>
/// Returns the text of the first <c>h1</c> heading found in <paramref name="sourceFile"/>
/// without parsing the whole file as HTML.
/// </summary>
/// <param name="sourceFile">Path of the HTML file to scan.</param>
/// <returns>The heading text, or an empty string when no heading is found.</returns>
/// <remarks>
/// The file is read line by line and each line is tested against <c>HeaderRegex</c>,
/// so a heading whose opening and closing tags are on different lines is not detected.
/// </remarks>
protected static string GetPageTitle(string sourceFile)
{
    using var reader = new StreamReader(sourceFile);

    while (reader.ReadLine() is string line)
    {
        var match = HeaderRegex.Match(line);
        if (!match.Success)
        {
            continue;
        }

        // Parse just the matched fragment and read the heading's text from it.
        var parser = new HtmlParser();
        var fragment = parser.ParseDocument(match.Value);

        // The regex is only a pre-filter: if the fragment turns out not to contain
        // an h1 element, keep scanning instead of dereferencing null.
        if (fragment.QuerySelector("h1") is { } heading)
        {
            return heading.Text();
        }
    }

    return "";
}
159167

160168
void Transform(IHtmlDocument document, string sourceFile, string language, in Nav nav, INodeList? headerNodes, INodeList? bannerNodes, INodeList? footerNodes)
161169
{
@@ -211,7 +219,7 @@ void Transform(IHtmlDocument document, string sourceFile, string language, in Na
211219

212220
/// <summary>
/// Sets the document title from its first <c>h1</c> element and removes every
/// <c>span.mw-editsection</c> element from the document.
/// </summary>
/// <param name="document">The document to modify in place.</param>
private static void CleanUp(IHtmlDocument document)
{
    // The title becomes the text of the first h1, or an empty string when there is none.
    var firstHeading = document.QuerySelector("h1");
    var headingText = firstHeading?.Text();
    document.Title = headingText ?? "";

    var editSections = document.QuerySelectorAll<IHtmlSpanElement>("span.mw-editsection");
    editSections.Remove();
}
@@ -517,4 +525,7 @@ public void PrintProblems()
517525
}
518526
}
519527
}
528+
529+
/// <summary>
/// Matches an <c>h1</c> element from its opening tag to the first closing <c>h1</c> tag.
/// The heading content may not contain a line break.
/// </summary>
/// <remarks>
/// Tag names are matched case-insensitively. The word boundary after <c>h1</c> keeps the
/// pattern from matching tag names that continue with a word character (for example <c>h10</c>).
/// </remarks>
[GeneratedRegex(@"<h1\b[^>]*>.*?</h1>", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
private static partial Regex HeaderRegex { get; }
520531
}

0 commit comments

Comments
 (0)