Skip to content

Commit 0f90fc4

Browse files
authored
Language Server: Add hover support for HTML character references (#1512)
https://github.com/user-attachments/assets/7eff224c-d14c-4ccb-94dd-1f54e727c3d7
1 parent 6301abd commit 0f90fc4

8 files changed

Lines changed: 402 additions & 9 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ java/org/herb/ast/Nodes.java
100100
java/org/herb/ast/NodeVisitor.java
101101
java/org/herb/ast/Visitor.java
102102
javascript/packages/core/src/errors.ts
103+
javascript/packages/core/src/html-entities.json
103104
javascript/packages/core/src/node-type-guards.ts
104105
javascript/packages/core/src/nodes.ts
105106
javascript/packages/core/src/visitor.ts

javascript/packages/core/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"module": "./dist/herb-core.esm.js",
1515
"types": "./dist/types/index.d.ts",
1616
"scripts": {
17-
"build": "yarn clean && rollup -c",
17+
"download-html-entities": "node scripts/download-html-entities.mjs",
18+
"build": "yarn download-html-entities && yarn clean && rollup -c",
1819
"dev": "rollup -c -w",
1920
"clean": "rimraf dist",
2021
"test": "vitest run",
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/usr/bin/env node
2+
3+
import { existsSync, writeFileSync } from "fs"
4+
import { fileURLToPath } from "url"
5+
import { dirname, join } from "path"
6+
7+
const __filename = fileURLToPath(import.meta.url)
8+
const __dirname = dirname(__filename)
9+
10+
const ENTITIES_URL = "https://html.spec.whatwg.org/entities.json"
11+
const OUTPUT_PATH = join(__dirname, "..", "src", "html-entities.json")
12+
13+
/**
 * Downloads the named character reference table from ENTITIES_URL and writes a
 * trimmed copy to OUTPUT_PATH.
 *
 * Does nothing when OUTPUT_PATH already exists. Only keys that end in ";" are
 * kept; each kept key loses its first and last character (the "&" and ";"),
 * and only the `characters` and `codepoints` fields are written.
 *
 * @throws {Error} When the HTTP response is not OK.
 */
async function downloadEntities() {
  const alreadyDownloaded = existsSync(OUTPUT_PATH)

  if (alreadyDownloaded) {
    console.log(`HTML entities already present at ${OUTPUT_PATH}, skipping download.`)
    return
  }

  console.log(`Downloading HTML named character references from ${ENTITIES_URL}...`)

  const response = await fetch(ENTITIES_URL)

  if (!response.ok) {
    throw new Error(`Failed to fetch entities: ${response.status} ${response.statusText}`)
  }

  const data = await response.json()

  // Sort by the raw key first so the emitted JSON has a stable key order.
  const sortedEntries = Object.entries(data).sort(([left], [right]) => left.localeCompare(right))

  const entities = Object.fromEntries(
    sortedEntries
      .filter(([reference]) => reference.endsWith(";"))
      .map(([reference, details]) => [
        reference.slice(1, -1),
        { characters: details.characters, codepoints: details.codepoints },
      ]),
  )

  const count = Object.keys(entities).length
  const serialized = JSON.stringify(entities, null, 2) + "\n"

  writeFileSync(OUTPUT_PATH, serialized)

  console.log(`Wrote ${count} named character references to ${OUTPUT_PATH}`)
}
46+
47+
downloadEntities().catch((error) => {
48+
console.error(error)
49+
process.exit(1)
50+
})
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// https://html.spec.whatwg.org/multipage/syntax.html#character-references
2+
// JSON source: https://html.spec.whatwg.org/entities.json
3+
import entities from "./html-entities.json"
4+
5+
export interface HTMLCharacterReference {
6+
characters: string
7+
codepoints: number[]
8+
}
9+
10+
export const HTML_NAMED_CHARACTER_REFERENCES: Record<string, HTMLCharacterReference> = entities
11+
12+
/**
 * Pattern that matches HTML character references: named (`&amp;`), decimal (`&#60;`), and hexadecimal (`&#x3C;` or `&#X3C;`).
 * Uses capturing groups: group 1 = hex digits, group 2 = decimal digits, group 3 = named reference name.
 *
 * The hexadecimal marker is matched in either case because the HTML syntax allows both `x` and `X` after `&#`.
 * Named references are matched by shape only; the name is not checked against the table of known references.
 *
 * The pattern carries the `g` flag, so calling `exec`/`test` on this shared instance advances its `lastIndex`.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#character-references
 */
export const CHARACTER_REFERENCE_PATTERN = /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z][a-zA-Z0-9]*));/g
19+
20+
/**
 * Reports whether `name` (without the leading `&` and trailing `;`) is a key of
 * HTML_NAMED_CHARACTER_REFERENCES.
 *
 * Only the table's own keys count: the `in` operator would also accept names
 * inherited from `Object.prototype`, such as `constructor` or `toString`.
 */
export function isNamedCharacterReference(name: string): boolean {
  return Object.prototype.hasOwnProperty.call(HTML_NAMED_CHARACTER_REFERENCES, name)
}
23+
24+
/**
 * Looks up `name` (without the leading `&` and trailing `;`) in
 * HTML_NAMED_CHARACTER_REFERENCES.
 *
 * @returns The matching entry, or `undefined` when `name` is not a key of the table.
 */
export function getNamedCharacterReference(name: string): HTMLCharacterReference | undefined {
  // Plain indexing would hand back inherited members (e.g. the `Object`
  // constructor for "constructor"), which are not entity records.
  if (!Object.prototype.hasOwnProperty.call(HTML_NAMED_CHARACTER_REFERENCES, name)) return undefined

  return HTML_NAMED_CHARACTER_REFERENCES[name]
}
27+
28+
/**
 * Checks if a string is a valid HTML character reference.
 * Supports named (`&amp;`), decimal (`&#60;`), and hexadecimal (`&#x3C;` or `&#X3C;`) references.
 *
 * The whole string must be exactly one reference: it has to start with `&` and end with `;`.
 * Numeric references are validated by shape only (at least one digit of the right kind);
 * named references are checked with `isNamedCharacterReference`.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#character-references
 */
export function isValidCharacterReference(text: string): boolean {
  if (!text.startsWith("&") || !text.endsWith(";")) return false

  // The hexadecimal marker may be written as `x` or `X`.
  if (text.startsWith("&#")) return /^&#(?:[xX][0-9a-fA-F]+|[0-9]+);$/.test(text)

  return isNamedCharacterReference(text.slice(1, -1))
}

javascript/packages/core/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
export * from "./ast-utils.js"
22
export * from "./html-constants.js"
3+
export * from "./html-character-references.js"
34
export * from "./backend.js"
45
export * from "./diagnostic.js"
56
export * from "./didyoumean.js"

javascript/packages/language-server/src/hover_service.ts

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@ import { TextDocument } from "vscode-languageserver-textdocument"
44
import { Visitor } from "@herb-tools/node-wasm"
55
import { IdentityPrinter } from "@herb-tools/printer"
66
import { ActionViewTagHelperToHTMLRewriter } from "@herb-tools/rewriter"
7-
import { isERBOpenTagNode, isHTMLElementNode } from "@herb-tools/core"
7+
import { isERBOpenTagNode, isHTMLElementNode, getNamedCharacterReference, CHARACTER_REFERENCE_PATTERN } from "@herb-tools/core"
88
import { ParserService } from "./parser_service"
99
import { lspPosition, isPositionInRange, rangeSize } from "./range_utils"
1010
import { ACTION_VIEW_HELPERS } from "./action_view_helpers"
1111

12-
import type { HTMLElementNode, ERBOpenTagNode } from "@herb-tools/core"
12+
import type { HTMLElementNode, ERBOpenTagNode, HTMLCharacterReference } from "@herb-tools/core"
1313

1414
class ActionViewElementCollector extends Visitor {
1515
public elements: { node: HTMLElementNode; openTag: ERBOpenTagNode; range: Range }[] = []
@@ -84,7 +84,7 @@ export class HoverService {
8484
}
8585

8686
if (!bestElement) {
87-
return null
87+
return this.getEntityHover(textDocument, position)
8888
}
8989

9090
const elementSource = bestElement.node.element_source
@@ -153,4 +153,121 @@ export class HoverService {
153153

154154
return IdentityPrinter.print(rewrittenNode)
155155
}
156+
157+
/**
 * Produces a hover for the HTML character reference under `position`, if there is one.
 *
 * Only the text of the line containing `position` is inspected. The returned hover
 * range covers the whole reference on that line.
 *
 * @returns A Markdown hover, or `null` when no character reference is found at the position.
 */
private getEntityHover(textDocument: TextDocument, position: Position): Hover | null {
  const { line, character } = position

  // From the start of this line to the start of the next one.
  const currentLine = Range.create(line, 0, line + 1, 0)
  const found = findCharacterReferenceAtPosition(textDocument.getText(currentLine), character)

  if (!found) {
    return null
  }

  const highlight = Range.create(line, found.start, line, found.end)
  const markdown = formatCharacterReferenceHover(found)

  return {
    contents: { kind: MarkupKind.Markdown, value: markdown },
    range: highlight,
  }
}
176+
}
177+
178+
interface CharacterReferenceMatch {
179+
reference: string
180+
start: number
181+
end: number
182+
characters: string
183+
codepoints: number[]
184+
type: "named" | "decimal" | "hexadecimal"
185+
name?: string
186+
}
187+
188+
/**
 * Builds the match record for a numeric character reference.
 *
 * @param reference The full matched text, e.g. `&#x3C;`.
 * @param start Offset of `reference` within the line.
 * @param digits The digits captured from the reference.
 * @param type Whether `digits` are decimal or hexadecimal.
 * @returns The match record, or `null` when the digits denote a value above U+10FFFF.
 */
function numericCharacterReferenceMatch(
  reference: string,
  start: number,
  digits: string,
  type: "decimal" | "hexadecimal",
): CharacterReferenceMatch | null {
  const codepoint = parseInt(digits, type === "hexadecimal" ? 16 : 10)

  // String.fromCodePoint throws a RangeError for values above U+10FFFF, and very
  // long digit runs parse to huge numbers or Infinity. Such references have no
  // character to show, so report "no match" instead of throwing.
  if (codepoint > 0x10ffff) return null

  return {
    reference,
    start,
    end: start + reference.length,
    characters: String.fromCodePoint(codepoint),
    codepoints: [codepoint],
    type,
  }
}

/**
 * Finds the HTML character reference in `lineText` that covers the offset `character`.
 *
 * @param lineText Text of a single line.
 * @param character Zero-based offset into `lineText`; a reference covers it when
 *   `start <= character < end`.
 * @returns Details of the reference under the offset, or `null` when there is none,
 *   when a named reference is unknown, or when a numeric reference is above U+10FFFF.
 */
function findCharacterReferenceAtPosition(lineText: string, character: number): CharacterReferenceMatch | null {
  // Build a private RegExp so scanning never touches the shared pattern's `lastIndex`.
  const pattern = new RegExp(CHARACTER_REFERENCE_PATTERN.source, "g")
  let match: RegExpExecArray | null

  while ((match = pattern.exec(lineText)) !== null) {
    const start = match.index
    const end = start + match[0].length

    if (character < start || character >= end) continue

    const reference = match[0]

    // Group 1 = hex digits, group 2 = decimal digits, group 3 = reference name.
    if (match[1]) return numericCharacterReferenceMatch(reference, start, match[1], "hexadecimal")
    if (match[2]) return numericCharacterReferenceMatch(reference, start, match[2], "decimal")

    if (match[3]) {
      const entity = getNamedCharacterReference(match[3])

      // Accept only real entity records; a lookup that resolves to anything else
      // (for example a member inherited from Object.prototype) is not a reference.
      if (entity && typeof entity.characters === "string" && Array.isArray(entity.codepoints)) {
        return {
          reference,
          start,
          end,
          characters: entity.characters,
          codepoints: entity.codepoints,
          type: "named",
          name: match[3],
        }
      }
    }

    // Matches never overlap, so no later match can cover the same offset.
    return null
  }

  return null
}
243+
244+
/**
 * Renders code points in `U+XXXX` notation, separated by ", ".
 *
 * Each value is written in uppercase hexadecimal and left-padded with zeros to
 * at least four digits. An empty list yields an empty string.
 */
function formatCodepoints(codepoints: number[]): string {
  const labels: string[] = []

  for (const value of codepoints) {
    const hex = value.toString(16).toUpperCase()
    labels.push("U+" + hex.padStart(4, "0"))
  }

  return labels.join(", ")
}
247+
248+
/**
 * Renders the Markdown shown when hovering a character reference.
 *
 * The output is a heading with the referenced character, a bold label for the
 * reference type, a two-column table (character, code points, reference text and,
 * for named references, the name) and a link to the HTML specification. The
 * sections are separated by blank lines.
 *
 * The referenced character can be any character, so it is wrapped defensively:
 * backticks get a longer code-span delimiter, line breaks become spaces, and
 * pipes are escaped inside the table.
 */
function formatCharacterReferenceHover(match: CharacterReferenceMatch): string {
  // Wraps text in a code span that survives backticks and line breaks in the text.
  const inlineCode = (text: string): string => {
    // A raw line break would end the heading or table row; a space keeps it on one line.
    const singleLine = text.replace(/\r\n?|\n/g, " ")
    const longestBacktickRun = (singleLine.match(/`+/g) ?? []).reduce((longest, run) => Math.max(longest, run.length), 0)

    // The delimiter must be longer than any backtick run inside the span. One space
    // of padding keeps a leading or trailing backtick from merging into the delimiter.
    const fence = "`".repeat(longestBacktickRun + 1)
    const padding = longestBacktickRun > 0 ? " " : ""

    return `${fence}${padding}${singleLine}${padding}${fence}`
  }

  // Inside a table a literal pipe ends the cell, even within a code span, unless escaped.
  const tableCode = (text: string): string => inlineCode(text).replace(/\|/g, "\\|")

  const typeLabels: Record<CharacterReferenceMatch["type"], string> = {
    named: "Named character reference",
    decimal: "Decimal numeric character reference",
    hexadecimal: "Hexadecimal numeric character reference",
  }

  const details: string[] = [
    `| Character | ${tableCode(match.characters)} |`,
    `| Codepoint${match.codepoints.length > 1 ? "s" : ""} | ${formatCodepoints(match.codepoints)} |`,
    `| Reference | ${tableCode(match.reference)} |`,
  ]

  if (match.name) {
    details.push(`| Name | ${tableCode(match.name)} |`)
  }

  const parts: string[] = [
    `## ${inlineCode(match.characters)}`,
    `**${typeLabels[match.type]}**`,
    `| | |\n|---|---|\n${details.join("\n")}`,
    `[HTML spec: Character references](https://html.spec.whatwg.org/multipage/syntax.html#character-references)`,
  ]

  return parts.join("\n\n")
}

0 commit comments

Comments
 (0)