-
Notifications
You must be signed in to change notification settings - Fork 163
Markdown to html improvement #965
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,68 @@ open MarkdownUtils | |
let internal htmlEncode (code: string) =
    // Escape the three characters that are significant in HTML text content.
    // "&" must be replaced first; otherwise the ampersands introduced by the
    // "&lt;" / "&gt;" replacements would themselves be escaped again.
    code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
|
|
||
/// Rewrites selected characters of the input as decimal HTML numeric character references.
/// Two groups are encoded: every well-formed surrogate pair (any code point outside the BMP),
/// and every BMP character in the range U+2000..U+2BFF. All other characters are copied verbatim.
/// When nothing needs encoding (including null or empty input) the original string is returned as-is.
let internal encodeHighUnicode (text: string) =
    if String.IsNullOrEmpty text then
        text
    else
        // The output buffer is created only when the first character to encode is met,
        // so input that needs no encoding never allocates.
        let mutable buffer: System.Text.StringBuilder voption = ValueNone
        let mutable pos = 0

        while pos < text.Length do
            let current = text.[pos]

            // A high surrogate immediately followed by a low surrogate forms one code point.
            let isPair =
                Char.IsHighSurrogate current
                && pos + 1 < text.Length
                && Char.IsLowSurrogate text.[pos + 1]

            let scalar =
                if isPair then
                    Char.ConvertToUtf32(current, text.[pos + 1])
                else
                    int current

            // All non-BMP code points are encoded; within the BMP only U+2000..U+2BFF is.
            let mustEncode = isPair || (scalar >= 0x2000 && scalar <= 0x2BFF)

            if mustEncode then
                let target =
                    match buffer with
                    | ValueSome existing -> existing
                    | ValueNone ->
                        let fresh = System.Text.StringBuilder(text.Length + 16)

                        // Everything before this position was left untouched; copy it in one go.
                        if pos > 0 then
                            fresh.Append(text, 0, pos) |> ignore

                        buffer <- ValueSome fresh
                        fresh

                target.Append("&#").Append(scalar).Append(';') |> ignore
            else
                // Plain characters are copied only once a buffer exists.
                match buffer with
                | ValueSome existing -> existing.Append current |> ignore
                | ValueNone -> ()

            // A surrogate pair occupies two UTF-16 code units.
            pos <- pos + (if isPair then 2 else 1)

        match buffer with
        | ValueSome built -> built.ToString()
        | ValueNone -> text
|
|
||
/// Basic escaping as done by Markdown including quotes
let internal htmlEncodeQuotes (code: string) =
    // Apply the basic escaping first, then turn double quotes into &quot;
    // so the result can be placed inside a double-quoted attribute value.
    (htmlEncode code).Replace("\"", "&quot;")
|
|
@@ -78,7 +140,7 @@ let rec internal formatSpan (ctx: FormattingContext) span = | |
|
|
||
| | AnchorLink(id, _) -> ctx.Writer.Write("<a name=\"" + htmlEncodeQuotes id + "\"> </a>") | ||
| | EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render()) | ||
| | Literal(str, _) -> ctx.Writer.Write(str) | ||
| | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str) | ||
| | HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline) | ||
|
Comment on lines
141
to
144
|
||
| | IndirectLink(body, _, LookupKey ctx.Links (link, title), _) | ||
| | DirectLink(body, link, title, _) -> | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () = | |
| |> Markdown.ToHtml | ||
| |> should contain "<p><code>a &gt; & b</code></p>" | ||
|
|
||
[<Test>]
let ``Emojis are encoded as HTML numeric entities`` () =
    // Each emoji in the input must show up in the rendered HTML as a decimal
    // numeric character reference rather than as the raw character.
    let html = "Like this 🎉🚧⭐⚠️✅" |> Markdown.ToHtml
    html |> should contain "&#127881;" // 🎉 party popper (U+1F389)
    html |> should contain "&#128679;" // 🚧 construction (U+1F6A7)
    html |> should contain "&#11088;" // ⭐ star (U+2B50)
    html |> should contain "&#9888;" // ⚠️ warning (U+26A0)
    html |> should contain "&#9989;" // ✅ check mark (U+2705)
|
Comment on lines
+34
to
+40
|
||
|
|
||
[<Test>]
let ``Regular text without emojis is not modified`` () =
    // Cyrillic and Chinese text must reach the output verbatim.
    let source =
        "This is regular text with пристаням Cyrillic and 中文 Chinese"

    let rendered = Markdown.ToHtml source
    rendered |> should contain "пристаням"
    rendered |> should contain "中文"
    // Ordinary international text must not produce any numeric character reference.
    rendered |> should not' (contain "&#")
|
|
||
[<Test>]
let ``List without blank line after heading`` () =
    // Scenario from https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382:
    // a list that starts on the line right after a heading, with no blank line in between.
    let source =
        """# This is my title
- this list
- should render"""

    let rendered = source |> Markdown.ToHtml

    // The heading must stay on its own and the list must be emitted as a separate element.
    rendered |> should contain "<h1>This is my title</h1>"
    rendered |> should contain "<ul>"
    rendered |> should contain "<li>this list</li>"
    rendered |> should contain "<li>should render</li>"
|
|
||
| [<Test>] | ||
| let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () = | ||
| let doc = """<a href="mailto:a@b.c">hi</a>""" |> Markdown.Parse | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment "Encode all characters outside BMP (>= 0x10000) as they're typically emojis" is misleading: many non-emoji scripts/symbol sets live outside the BMP. If the intended behavior is "encode all non-BMP code points to avoid output encoding issues", consider rewording the comment to reflect that rationale rather than implying they're mostly emojis.