Skip to content

Commit b234d6f

Browse files
committed
HTML converter converts content inside style tag in header #11
1 parent c1ea810 commit b234d6f

File tree

4 files changed

+184
-8
lines changed

4 files changed

+184
-8
lines changed

Directory.Build.props

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
<PackageLicenseExpression>MIT</PackageLicenseExpression>
2222
<PackageReadmeFile>README.md</PackageReadmeFile>
2323
<Product>Managed Code - MarkItDown</Product>
24-
<Version>0.0.2</Version>
25-
<PackageVersion>0.0.2</PackageVersion>
24+
<Version>0.0.3</Version>
25+
<PackageVersion>0.0.3</PackageVersion>
2626
</PropertyGroup>
2727

2828
<PropertyGroup Condition="'$(GITHUB_ACTIONS)' == 'true'">

src/MarkItDown/Converters/HtmlMarkdownRenderer.cs

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,43 @@
11
using AngleSharp.Dom;
22
using AngleSharp.Html.Dom;
3+
using System.Collections.Generic;
34
using System.Linq;
45
using System.Text;
56

67
namespace MarkItDown.Converters;
78

89
internal sealed class HtmlMarkdownRenderer
910
{
11+
// Tags that do not produce visible Markdown output and are safe to drop entirely.
12+
private static readonly HashSet<string> NonRenderableElements = new(StringComparer.OrdinalIgnoreCase)
13+
{
14+
"style",
15+
"script",
16+
"noscript",
17+
"template",
18+
"head",
19+
"meta",
20+
"link",
21+
"base",
22+
"title",
23+
"iframe",
24+
"frame",
25+
"frameset",
26+
"object",
27+
"embed",
28+
"param",
29+
"source",
30+
"track",
31+
};
32+
1033
public HtmlRenderResult RenderDocument(IHtmlDocument document)
1134
{
1235
var markdown = new StringBuilder();
13-
ConvertNode(document.DocumentElement, markdown, 0);
36+
var root = document.Body ?? document.DocumentElement;
37+
if (root is not null)
38+
{
39+
ConvertNode(root, markdown, 0);
40+
}
1441
var normalized = NormalizeSpacing(markdown.ToString());
1542
var title = ExtractTitle(document);
1643
return new HtmlRenderResult(normalized, title);
@@ -71,6 +98,9 @@ private static void ConvertNode(INode? node, StringBuilder markdown, int indentL
7198
}
7299
break;
73100

101+
case NodeType.Comment:
102+
break;
103+
74104
case NodeType.Text:
75105
AppendText(node, markdown);
76106
break;
@@ -97,6 +127,11 @@ private static void ConvertElement(IElement element, StringBuilder markdown, int
97127
{
98128
var tag = element.TagName.ToLowerInvariant();
99129

130+
if (NonRenderableElements.Contains(tag))
131+
{
132+
return;
133+
}
134+
100135
switch (tag)
101136
{
102137
case "h1":

tests/MarkItDown.Tests/HtmlConverterTests.cs

Lines changed: 146 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,151 @@ public async Task ConvertAsync_SimpleHtml_ReturnsCorrectMarkdown()
100100
Assert.Equal("Test Page", result.Title);
101101
}
102102

103+
[Fact]
104+
public async Task ConvertAsync_HtmlWithStyleElement_IgnoresStyleContent()
105+
{
106+
// Arrange
107+
var converter = new HtmlConverter();
108+
var html = "<html><head><style> h1 {color: red;} </style></head><body><h1>Test</h1></body></html>";
109+
var bytes = Encoding.UTF8.GetBytes(html);
110+
using var stream = new MemoryStream(bytes);
111+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
112+
113+
// Act
114+
var result = await converter.ConvertAsync(stream, streamInfo);
115+
116+
// Assert
117+
Assert.Contains("# Test", result.Markdown);
118+
Assert.DoesNotContain("h1 {color: red;}", result.Markdown);
119+
}
120+
121+
[Fact]
122+
public async Task ConvertAsync_HtmlWithBodyStyleElement_IgnoresStyleContent()
123+
{
124+
// Arrange
125+
var converter = new HtmlConverter();
126+
var html = "<html><body><style> body {background: blue;} </style><h1>Inline</h1></body></html>";
127+
var bytes = Encoding.UTF8.GetBytes(html);
128+
using var stream = new MemoryStream(bytes);
129+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
130+
131+
// Act
132+
var result = await converter.ConvertAsync(stream, streamInfo);
133+
134+
// Assert
135+
Assert.Contains("# Inline", result.Markdown);
136+
Assert.DoesNotContain("body {background: blue;}", result.Markdown);
137+
}
138+
139+
[Fact]
140+
public async Task ConvertAsync_HtmlWithScriptElement_IgnoresScriptContent()
141+
{
142+
// Arrange
143+
var converter = new HtmlConverter();
144+
var html = "<html><head><script>console.log('test');</script></head><body><h1>Run</h1><script>console.log('body');</script></body></html>";
145+
var bytes = Encoding.UTF8.GetBytes(html);
146+
using var stream = new MemoryStream(bytes);
147+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
148+
149+
// Act
150+
var result = await converter.ConvertAsync(stream, streamInfo);
151+
152+
// Assert
153+
Assert.Contains("# Run", result.Markdown);
154+
Assert.DoesNotContain("console.log('test');", result.Markdown);
155+
Assert.DoesNotContain("console.log('body');", result.Markdown);
156+
}
157+
158+
[Fact]
159+
public async Task ConvertAsync_HtmlWithHeadLink_IgnoresLinkMetadata()
160+
{
161+
// Arrange
162+
var converter = new HtmlConverter();
163+
var html = "<html><head><link rel=\"stylesheet\" href=\"styles.css\"></head><body><h1>Linked</h1></body></html>";
164+
var bytes = Encoding.UTF8.GetBytes(html);
165+
using var stream = new MemoryStream(bytes);
166+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
167+
168+
// Act
169+
var result = await converter.ConvertAsync(stream, streamInfo);
170+
171+
// Assert
172+
Assert.Contains("# Linked", result.Markdown);
173+
Assert.DoesNotContain("styles.css", result.Markdown);
174+
}
175+
176+
[Fact]
177+
public async Task ConvertAsync_HtmlWithAnchor_RendersMarkdownLink()
178+
{
179+
// Arrange
180+
var converter = new HtmlConverter();
181+
var html = "<html><head><link rel=\"stylesheet\" href=\"styles.css\"></head><body><p>Visit <a href=\"https://example.com\">Example</a></p></body></html>";
182+
var bytes = Encoding.UTF8.GetBytes(html);
183+
using var stream = new MemoryStream(bytes);
184+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
185+
186+
// Act
187+
var result = await converter.ConvertAsync(stream, streamInfo);
188+
189+
// Assert
190+
Assert.Contains("[Example](https://example.com)", result.Markdown);
191+
Assert.DoesNotContain("styles.css", result.Markdown);
192+
}
193+
194+
[Fact]
195+
public async Task ConvertAsync_HtmlWithNoscript_IgnoresNoscriptContent()
196+
{
197+
// Arrange
198+
var converter = new HtmlConverter();
199+
var html = "<html><body><noscript><p>Enable JavaScript</p></noscript><h1>Visible</h1></body></html>";
200+
var bytes = Encoding.UTF8.GetBytes(html);
201+
using var stream = new MemoryStream(bytes);
202+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
203+
204+
// Act
205+
var result = await converter.ConvertAsync(stream, streamInfo);
206+
207+
// Assert
208+
Assert.Contains("# Visible", result.Markdown);
209+
Assert.DoesNotContain("Enable JavaScript", result.Markdown);
210+
}
211+
212+
[Fact]
213+
public async Task ConvertAsync_HtmlWithTemplate_IgnoresTemplateContent()
214+
{
215+
// Arrange
216+
var converter = new HtmlConverter();
217+
var html = "<html><body><template><p>Hidden Content</p></template><h1>Shown</h1></body></html>";
218+
var bytes = Encoding.UTF8.GetBytes(html);
219+
using var stream = new MemoryStream(bytes);
220+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
221+
222+
// Act
223+
var result = await converter.ConvertAsync(stream, streamInfo);
224+
225+
// Assert
226+
Assert.Contains("# Shown", result.Markdown);
227+
Assert.DoesNotContain("Hidden Content", result.Markdown);
228+
}
229+
230+
[Fact]
231+
public async Task ConvertAsync_HtmlWithIframe_IgnoresIframeContent()
232+
{
233+
// Arrange
234+
var converter = new HtmlConverter();
235+
var html = "<html><body><iframe src=\"https://example.com\">Fallback Text</iframe><h1>Visible</h1></body></html>";
236+
var bytes = Encoding.UTF8.GetBytes(html);
237+
using var stream = new MemoryStream(bytes);
238+
var streamInfo = new StreamInfo(charset: Encoding.UTF8);
239+
240+
// Act
241+
var result = await converter.ConvertAsync(stream, streamInfo);
242+
243+
// Assert
244+
Assert.Contains("# Visible", result.Markdown);
245+
Assert.DoesNotContain("Fallback Text", result.Markdown);
246+
}
247+
103248
[Fact]
104249
public async Task ConvertAsync_HtmlWithLink_CreatesMarkdownLink()
105250
{
@@ -221,4 +366,4 @@ public async Task ConvertAsync_EmptyHtml_ReturnsEmptyString()
221366
// Assert
222367
Assert.Equal(string.Empty, result.Markdown);
223368
}
224-
}
369+
}

tests/MarkItDown.Tests/MarkItDown.Tests.csproj

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
4-
<TargetFramework>net9.0</TargetFramework>
5-
<ImplicitUsings>enable</ImplicitUsings>
6-
<Nullable>enable</Nullable>
7-
<LangVersion>preview</LangVersion>
84
<IsPackable>false</IsPackable>
95
</PropertyGroup>
106

0 commit comments

Comments
 (0)