Skip to content

Commit 994f955

Browse files
authored
Merge pull request #17 from lupodevelop/14-html-escaping-and-unescaping-is-slow-and-incomplete
14 html escaping and unescaping is slow and incomplete
2 parents 7a25a3a + b586df5 commit 994f955

10 files changed

Lines changed: 254 additions & 15 deletions

.github/workflows/ci.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
name: test
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
- main
8+
pull_request:
9+
10+
jobs:
11+
test:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v4
15+
- uses: erlef/setup-beam@v1
16+
with:
17+
otp-version: "28"
18+
gleam-version: "1.13.0"
19+
rebar3-version: "3"
20+
# elixir-version: "1"
21+
- run: gleam deps download
22+
- run: gleam test
23+
- run: gleam format --check src test

CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,24 @@
22

33
All notable changes to this project are documented in this file.
44

5+
## [1.2.3] - 2026-01-08
6+
### Changed
7+
- Replaced `escape_html` implementation with `houdini.escape` for faster,
8+
allocation-friendly HTML escaping.
9+
- Replaced `unescape_html` with `odysseus.unescape` for comprehensive HTML
10+
entity unescaping (named entities, numeric decimal and hex entities).
11+
- Added dependencies: `houdini`, `odysseus`.
12+
13+
### Tests
14+
- Added tests for HTML escape/unescape and numeric entities (decimal and hex).
15+
16+
Contributed by: Daniele (`lupodevelop`)
17+
Suggested by: Louis Pilfold (`@lpil`)
18+
19+
Suggested by: NNB (`@NNBnh`)
20+
Suggested change: updated README logo pointer to use the raw.githubusercontent URL
21+
(pointing to the repository commit) so the logo is resolvable on Hexdocs.
22+
523
## [1.2.2] - 2026-01-05
624
### Added
725
- Added internal helper `grapheme_len/1` (internal) to centralize grapheme cluster length computation and avoid repetitive `string.to_graphemes |> list.length` patterns.

CONTRIBUTING.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Contributing to str
2+
3+
Thanks for helping! Short, practical guide.
4+
5+
## Quick start
6+
- Fork, create a branch: `git switch -c feat/your-change`.
7+
- Run `gleam format` and `gleam test` locally.
8+
- Open a PR against `main` with a short description and tests.
9+
10+
## Setup
11+
- Requirements: Gleam (see `gleam.toml`)
12+
13+
Commands:
14+
```bash
15+
gleam format
16+
gleam test
17+
```
18+
19+
## Commits
20+
Use brief prefixes: `feat:`, `fix:`, `chore:`, `test:`, `perf:`.
21+
Example: `feat(display): add truncate_display`
22+
These prefixes are a guideline, not a hard rule; nothing enforces them.
23+
24+
## PR checklist
25+
- [ ] Tests added/updated
26+
- [ ] `gleam format` & `gleam test` pass
27+
- [ ] Update `CHANGELOG.md` if behaviour changes
28+
- [ ] Document noteworthy changes in `README.md`, `docs/` or `examples/`
29+
30+
## Deprecations
31+
- Report breaking changes in an issue and add migration notes in PRs. See `DEPRECATIONS.md` if present.
32+
33+
## Testing
34+
- Add unit tests for edge cases (ZWJ, skin tones, combining marks, CJK, ambiguous widths).

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<p align="center">
2-
<img src="assets/img/logo-str.png" alt="str logo" width="280">
2+
<img src="https://raw.githubusercontent.com/lupodevelop/str/c190b21/assets/img/logo-str.png" alt="str logo" width="280">
33
</p>
44

55
<h1 align="center">str</h1>
@@ -327,6 +327,8 @@ gleam test
327327
python3 scripts/generate_character_tables.py
328328
```
329329

330+
Note: as of **1.2.3**, `escape_html` uses the `houdini` library for fast, allocation-friendly escaping, and `unescape_html` uses `odysseus` for comprehensive entity support (named, decimal and hex numeric entities). See [CHANGELOG.md](CHANGELOG.md) for details.
331+
330332
---
331333

332334
## 📊 Test Coverage

gleam.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
name = "str"
2-
version = "1.2.2"
2+
version = "1.2.3"
33

44
# Project metadata (fill or replace placeholders before publishing)
55
description = "Unicode-aware string utilities for Gleam: grapheme-safe operations, pragmatic ASCII transliteration, and slug generation."
66
licenses = ["MIT"]
77
repository = { type = "github", user = "lupodevelop", repo = "str" }
8-
links = [{ title = "Repository", href = "https://github.com/lupodevelop/str" }]
98

109
# For a full reference of all the available options, see:
1110
# https://gleam.run/writing-gleam/gleam-toml/
1211

1312
[dependencies]
1413
gleam_stdlib = ">= 0.44.0 and < 2.0.0"
14+
houdini = ">= 1.0.0 and < 2.0.0"
15+
odysseus = ">= 1.0.0 and < 2.0.0"
1516

1617
[dev-dependencies]
1718
gleeunit = ">= 1.0.0 and < 2.0.0"

manifest.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
packages = [
55
{ name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" },
66
{ name = "gleeunit", version = "1.9.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "DA9553CE58B67924B3C631F96FE3370C49EB6D6DC6B384EC4862CC4AAA718F3C" },
7+
{ name = "houdini", version = "1.2.0", build_tools = ["gleam"], requirements = [], otp_app = "houdini", source = "hex", outer_checksum = "5DB1053F1AF828049C2B206D4403C18970ABEF5C18671CA3C2D2ED0DD64F6385" },
8+
{ name = "odysseus", version = "1.0.0", build_tools = ["gleam"], requirements = [], otp_app = "odysseus", source = "hex", outer_checksum = "6A97DA1075BDDEA8B60F47B1DFFAD49309FA27E73843F13A0AF32EA7087BA11C" },
79
]
810

911
[requirements]
1012
gleam_stdlib = { version = ">= 0.44.0 and < 2.0.0" }
1113
gleeunit = { version = ">= 1.0.0 and < 2.0.0" }
14+
houdini = { version = ">= 1.0.0 and < 2.0.0" }
15+
odysseus = { version = ">= 1.0.0 and < 2.0.0" }

src/str/core.gleam

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ import gleam/dict
1313
import gleam/int
1414
import gleam/list
1515
import gleam/string
16+
import houdini
17+
import odysseus
1618
import str/config
1719

1820
/// Detects if a grapheme cluster likely contains emoji components.
@@ -1766,12 +1768,7 @@ pub fn is_hex(text: String) -> Bool {
17661768
/// escape_html("Say \"hello\"") -> "Say &quot;hello&quot;"
17671769
///
17681770
pub fn escape_html(text: String) -> String {
1769-
text
1770-
|> string.replace("&", "&amp;")
1771-
|> string.replace("<", "&lt;")
1772-
|> string.replace(">", "&gt;")
1773-
|> string.replace("\"", "&quot;")
1774-
|> string.replace("'", "&#39;")
1771+
houdini.escape(text)
17751772
}
17761773

17771774
/// Unescapes HTML entities to their character equivalents.
@@ -1781,12 +1778,7 @@ pub fn escape_html(text: String) -> String {
17811778
/// unescape_html("Tom &amp; Jerry") -> "Tom & Jerry"
17821779
///
17831780
pub fn unescape_html(text: String) -> String {
1784-
text
1785-
|> string.replace("&#39;", "'")
1786-
|> string.replace("&quot;", "\"")
1787-
|> string.replace("&gt;", ">")
1788-
|> string.replace("&lt;", "<")
1789-
|> string.replace("&amp;", "&")
1781+
odysseus.unescape(text)
17901782
}
17911783

17921784
/// Escapes regex metacharacters so the string can be used as a literal pattern.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import gleam/list
2+
import gleeunit
3+
import str
4+
5+
/// Entry point of this test module; it only delegates to `gleeunit.main()`.
pub fn main() -> Nil {
  gleeunit.main()
}
8+
9+
/// Escaping a sample and then unescaping the result must give back the
/// sample unchanged. The samples contain `<`, `>`, `&`, `"` and `'`.
pub fn roundtrip_basic_entities_test() {
  let samples = [
    "<div>Hello</div>",
    "Tom & Jerry",
    "Say \"hello\"",
    "It's me",
    "5 < 10 && 10 > 5",
    "Ampersand: &",
  ]

  list.each(samples, fn(original) {
    let restored =
      original
      |> str.escape_html
      |> str.unescape_html
    assert restored == original
  })

  // Reached only when every sample passed; the test evaluates to `True`.
  True
}
26+
27+
/// Named, decimal (`&#NN;`) and hexadecimal (`&#xNN;`) entities are all
/// decoded by `str.unescape_html`.
pub fn numeric_and_named_entities_test() {
  let mixed = str.unescape_html("&lt;&gt;&amp;&#39;&#x27;&#34;")
  assert mixed == "<>&''\""

  let double_quotes = str.unescape_html("&quot; and &#34; and &#x22;")
  assert double_quotes == "\" and \" and \""

  let decimal_apostrophes = str.unescape_html("I like &#39;quotes&#39;")
  assert decimal_apostrophes == "I like 'quotes'"

  let hex_apostrophe = str.unescape_html("Hex: &#x27;")
  assert hex_apostrophe == "Hex: '"
}
33+
34+
/// Input that merely looks like an entity must come back unchanged.
pub fn malformed_and_unknown_entity_test() {
  // An entity missing its terminating semicolon is expected to pass through.
  let unterminated = "This &amp is broken"
  assert str.unescape_html(unterminated) == unterminated

  // An entity with an unknown name is expected to pass through as well.
  let unknown = "This &notanentity; remains"
  assert str.unescape_html(unknown) == unknown
}
42+
43+
/// Entities placed directly next to each other are each decoded.
pub fn combined_and_adjacent_entities_test() {
  let angle_pairs = str.unescape_html("&lt;&lt; &gt;&gt;")
  assert angle_pairs == "<< >>"

  let ampersands = str.unescape_html("&amp;&amp;&amp;")
  assert ampersands == "&&&"
}
47+
48+
/// Non-ASCII text (accented letters, a dash, an emoji sequence, NBSP) must
/// survive an escape/unescape round trip unchanged.
pub fn unicode_and_emoji_roundtrip_test() {
  let original = "Café — ️👩‍👩‍👧‍👦 \u{00A0}"

  // Escaping may leave these characters untouched; only the round trip
  // result is checked here.
  let restored =
    original
    |> str.escape_html
    |> str.unescape_html
  assert restored == original
}
54+
55+
/// Escaping twice double-encodes the ampersand, and each unescape call
/// removes exactly one level of encoding.
pub fn idempotence_and_double_escape_test() {
  let raw = "&"
  let escaped_once = str.escape_html(raw)
  let escaped_twice = str.escape_html(escaped_once)
  assert escaped_once == "&amp;"
  assert escaped_twice == "&amp;amp;"

  // "&amp;amp;" -> "&amp;" -> "&": two unescape passes restore the input.
  let unescaped_once = str.unescape_html(escaped_twice)
  assert unescaped_once == "&amp;"
  assert str.unescape_html(unescaped_once) == raw
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import gleeunit
2+
import str
3+
import gleam/list
4+
import gleam/string
5+
6+
/// Entry point of the fuzz test module; it only delegates to
/// `gleeunit.main()`.
pub fn main() -> Nil {
  gleeunit.main()
}
9+
10+
/// Fixed pool of tokens from which the generated test strings are assembled.
/// The order matters: tokens are selected by index.
fn gen_token_pool() -> List(String) {
  [
    // Plain ASCII, whitespace and markup-significant characters.
    "a", "b", "c", "1", "2", "3", " ", "\n", "<", ">", "&", "\"", "'",
    // Complete entities (named, decimal, hex) plus one unknown name.
    "&amp;", "&lt;", "&gt;", "&quot;", "&#39;", "&#x27;", "&#x22;",
    "&notanentity;",
    // Bare ampersand and truncated entity prefixes.
    "&", "&amp", "&#", "&#x",
    // NBSP
    "\u{00A0}",
    // Accented Latin and CJK text.
    "Café", "naïve", "ø", "漢", "字",
    // Emoji sequences and a lone variation selector.
    "👩‍👩‍👧‍👦", "👨‍👩‍👧", "️", "✈️", "🏳️‍🌈",
    // combining acute
    "\u{0301}",
    // Further named entities.
    "&alpha;", "&beta;", "&gamma;",
  ]
}
23+
24+
/// Deterministic token index derived from `seed` and the position `i`.
///
/// Computes `seed * 1103515245 + 12345 + i`, takes its absolute value and
/// reduces it modulo `len`. The seed only fixes a starting offset: for a given
/// seed the index advances by one per position (for non-negative
/// intermediate values), wrapping at `len`.
fn idx_for(seed: Int, i: Int, len: Int) -> Int {
  let mixed = seed * 1103515245 + 12345 + i
  let magnitude = case mixed >= 0 {
    True -> mixed
    False -> -mixed
  }
  magnitude % len
}
31+
32+
/// Builds a test string by concatenating `n` tokens chosen from `tokens`.
///
/// The token for position `i` (0-based) is the one at index
/// `idx_for(seed, i, len)`, so the result is fully determined by the
/// arguments. A non-positive `n` yields the empty string.
fn gen_string(seed: Int, tokens: List(String), n: Int) -> String {
  let len = list.length(tokens)
  case n <= 0 {
    // `list.range(0, n - 1)` would count downwards here (e.g. `[0, -1]` for
    // `n == 0`) and produce tokens where none were requested.
    True -> ""
    False ->
      list.range(0, n - 1)
      |> list.map(fn(i) {
        // Pick the token at the computed index; an out-of-range index (only
        // possible with an empty pool) contributes nothing.
        case list.drop(tokens, idx_for(seed, i, len)) {
          [token, ..] -> token
          [] -> ""
        }
      })
      |> string.concat
  }
}
45+
46+
/// Generates one string for the given `seed` and token count `n`, then checks
/// two properties of `str.escape_html` / `str.unescape_html` on it.
/// Evaluates to `True` when both hold; a failed `assert` aborts the test.
fn run_cfg(seed: Int, n: Int, tokens: List(String)) -> Bool {
  let input = gen_string(seed, tokens, n)
  let escaped = str.escape_html(input)

  // Round trip: unescaping the escaped text must reproduce the input.
  assert str.unescape_html(escaped) == input

  // The escaped text must not contain raw angle brackets or quotes.
  ["<", ">", "\"", "'"]
  |> list.each(fn(forbidden) {
    assert !string.contains(escaped, forbidden)
  })

  True
}
61+
62+
/// Runs the round-trip property check for three fixed `#(seed, length)`
/// configurations over the shared token pool.
pub fn fuzz_roundtrip_test() {
  let tokens = gen_token_pool()

  [#(1, 20), #(42, 50), #(123, 200)]
  |> list.each(fn(cfg) {
    let #(seed, n) = cfg
    run_cfg(seed, n, tokens)
  })

  True
}

test/str_html_escape_test.gleam

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import str
2+
3+
/// `str.escape_html` replaces angle brackets, ampersands and double quotes
/// with their named entities.
pub fn escape_basic_test() {
  let tags = str.escape_html("<div>Hello</div>")
  assert tags == "&lt;div&gt;Hello&lt;/div&gt;"

  let ampersand = str.escape_html("Tom & Jerry")
  assert ampersand == "Tom &amp; Jerry"

  let quotes = str.escape_html("Say \"hello\"")
  assert quotes == "Say &quot;hello&quot;"
}
8+
9+
/// `str.unescape_html` decodes the basic named entities and the decimal
/// apostrophe entity.
pub fn unescape_basic_test() {
  let tags = str.unescape_html("&lt;div&gt;")
  assert tags == "<div>"

  let ampersand = str.unescape_html("Tom &amp; Jerry")
  assert ampersand == "Tom & Jerry"

  let quotes = str.unescape_html("Say &quot;hello&quot;")
  assert quotes == "Say \"hello\""

  let apostrophe = str.unescape_html("It&#39;s me")
  assert apostrophe == "It's me"
}
15+
16+
/// Escaping and then unescaping a string containing `&`, `<`, `>` and `"`
/// gives back the original string.
pub fn roundtrip_test() {
  let original = "Hello & < > \""
  let restored =
    original
    |> str.escape_html
    |> str.unescape_html
  assert restored == original
}
21+
22+
/// Numeric character references are decoded in both decimal and hex form.
pub fn numeric_entities_test() {
  // Decimal form: &#39; is the apostrophe.
  let decimal = str.unescape_html("I like &#39;quotes&#39;")
  assert decimal == "I like 'quotes'"

  // Hexadecimal form: &#x27; is the same apostrophe.
  let hex = str.unescape_html("Hex: &#x27;")
  assert hex == "Hex: '"

  // The double quote written as a named, a decimal and a hex entity.
  let double_quotes = str.unescape_html("&quot; and &#34; and &#x22;")
  assert double_quotes == "\" and \" and \""
}

0 commit comments

Comments
 (0)