Skip to content

Commit c4cc706

Browse files
committed
Merge branch 'main' into close-tags
Move tests of option closeAllByClose to test/tests/unclosedtags.js
2 parents eac4de9 + f7a190c commit c4cc706

10 files changed

Lines changed: 1199 additions & 886 deletions

File tree

.github/workflows/build.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@ jobs:
1717

1818
steps:
1919
- name: Checkout
20-
uses: actions/checkout@v2
20+
uses: actions/checkout@v4
2121

2222
- name: Determine Yarn Cache Path
2323
id: yarn-cache-dir-path
24-
run: echo "::set-output name=dir::$(yarn cache dir)"
24+
run: echo "dir=$(yarn cache dir)" >> $GITHUB_OUTPUT
2525

26-
- uses: actions/cache@v1
26+
- uses: actions/cache@v4
2727
id: yarn-cache # use this to check for `cache-hit` (`steps.yarn-cache.outputs.cache-hit != 'true'`)
2828
with:
2929
path: ${{ steps.yarn-cache-dir-path.outputs.dir }}
@@ -32,7 +32,7 @@ jobs:
3232
${{ runner.os }}-yarn-
3333
3434
- name: Use Node.js ${{ matrix.node-version }}
35-
uses: actions/setup-node@v1
35+
uses: actions/setup-node@v4
3636
with:
3737
node-version: ${{ matrix.node-version }}
3838

.github/workflows/publish.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ jobs:
1111

1212
steps:
1313
- name: Checkout
14-
uses: actions/checkout@v2
14+
uses: actions/checkout@v4
1515

16-
- name: Setup Node.js 22.x to publish to npmjs.org
17-
uses: actions/setup-node@v1
16+
- name: Setup Node.js to publish to npmjs.org
17+
uses: actions/setup-node@v4
1818
with:
1919
node-version: "18.x"
2020
registry-url: "https://registry.npmjs.org"

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22

33
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
44

5+
### [7.0.2](https://github.com/taoqf/node-fast-html-parser/compare/v7.0.1...v7.0.2) (2026-01-07)
6+
7+
8+
### Bug Fixes
9+
10+
* [#227](https://github.com/taoqf/node-fast-html-parser/issues/227) ([51528c4](https://github.com/taoqf/node-fast-html-parser/commit/51528c41ef2648d6c4dc1aecd14ee9d2b0083c4f))
11+
* [#294](https://github.com/taoqf/node-fast-html-parser/issues/294) Closing tag is missing but valid HTML is still not parseable ([950865f](https://github.com/taoqf/node-fast-html-parser/commit/950865fab5f4df7853b36712869b71c90f4d3a1b))
12+
* add missing dev dependency: yarn ([6d73ea3](https://github.com/taoqf/node-fast-html-parser/commit/6d73ea37c48f4170c35907869ba410c5122a9a1f))
13+
* test valid.js ([a81fc46](https://github.com/taoqf/node-fast-html-parser/commit/a81fc46fab2507615b0362150d62568a6f52ee4e))
14+
515
### [7.0.1](https://github.com/taoqf/node-fast-html-parser/compare/v7.0.0...v7.0.1) (2024-12-26)
616

717

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import { parse } from 'node-html-parser';
4343

4444
const root = parse('<ul id="list"><li>Hello World</li></ul>');
4545

46+
// parse() adds a wrapper node, so the input data's first node is the root's first child node
4647
console.log(root.firstChild.structure);
4748
// ul#list
4849
// li
@@ -74,7 +75,7 @@ var root = HTMLParser.parse('<ul id="list"><li>Hello World</li></ul>');
7475

7576
### parse(data[, options])
7677

77-
Parse the data provided, and return the root of the generated DOM.
78+
Parse the data provided, wrap the result in a new node, and return the root of the generated DOM.
7879

7980
- **data**, data to parse
8081
- **options**, parse options
@@ -350,11 +351,11 @@ Get all child elements, so all child nodes of type HTMLElement.
350351

351352
### firstChild
352353

353-
Get first child node. `undefined` if the node has no children.
354+
Get first child node. `undefined` if the node has no children. On the root returned by `parse()`, which is a wrapper node, this is the first node of the parsed input.
354355

355356
### lastChild
356357

357-
Get last child node. `undefined` if the node has no children.
358+
Get last child node. `undefined` if the node has no children. On the root returned by `parse()`, which is a wrapper node, this is the last node of the parsed input.
358359

359360
### firstElementChild
360361

package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "node-html-parser",
3-
"version": "7.0.1",
3+
"version": "7.0.2",
44
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
55
"main": "dist/index.js",
66
"types": "dist/index.d.ts",
@@ -87,7 +87,8 @@
8787
"standard-version": "^9.5.0",
8888
"travis-cov": "latest",
8989
"ts-node": "^10.9.1",
90-
"typescript": "latest"
90+
"typescript": "latest",
91+
"yarn": "^1.22.22"
9192
},
9293
"config": {
9394
"blanket": {

src/nodes/html.ts

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ export default class HTMLElement extends Node {
139139
private _attrs: Attributes;
140140
private _rawAttrs: RawAttributes;
141141
private _parseOptions: Partial<Options>;
142+
private _id: string;
142143
public rawTagName: string; // there is no friend function in ES
143-
public id: string;
144144
public classList: DOMTokenList;
145145

146146
/**
@@ -185,7 +185,7 @@ export default class HTMLElement extends Node {
185185
super(parentNode, range);
186186
this.rawTagName = tagName;
187187
this.rawAttrs = rawAttrs || '';
188-
this.id = keyAttrs.id || '';
188+
this._id = keyAttrs.id || '';
189189
this.childNodes = [];
190190
this._parseOptions = _parseOptions;
191191
this.classList = new DOMTokenList(
@@ -248,6 +248,13 @@ export default class HTMLElement extends Node {
248248
return this.voidTag.isVoidElement(this.localName);
249249
}
250250

251+
public get id() {
252+
return this._id;
253+
}
254+
public set id(newid: string) {
255+
this.setAttribute('id', newid);
256+
}
257+
251258
/**
252259
* Get escaped (as-is) text value of current node and its children.
253260
* @return {string} text content
@@ -417,7 +424,7 @@ export default class HTMLElement extends Node {
417424
res.push(' '.repeat(indention) + str);
418425
}
419426
function dfs(node: HTMLElement) {
420-
const idStr = node.id ? `#${node.id}` : '';
427+
const idStr = node._id ? `#${node._id}` : '';
421428
const classStr = node.classList.length ? `.${node.classList.value.join('.')}` : ''; // eslint-disable-line @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/restrict-template-expressions, @typescript-eslint/no-unsafe-call
422429
write(`${node.rawTagName}${idStr}${classStr}`);
423430
indention++;
@@ -565,7 +572,7 @@ export default class HTMLElement extends Node {
565572
}
566573

567574
if (child.nodeType === NodeType.ELEMENT_NODE) {
568-
if (child.id === id) {
575+
if (child._id === id) {
569576
return child;
570577
}
571578

@@ -716,9 +723,9 @@ export default class HTMLElement extends Node {
716723
return `${name}=${val}`;
717724
})
718725
.join(' ');
719-
// Update this.id
726+
// Update this._id
720727
if (key === 'id') {
721-
this.id = '';
728+
this._id = '';
722729
}
723730
return this;
724731
}
@@ -765,9 +772,9 @@ export default class HTMLElement extends Node {
765772
return `${name}=${val}`;
766773
})
767774
.join(' ');
768-
// Update this.id
775+
// Update this._id
769776
if (key === 'id') {
770-
this.id = value;
777+
this._id = value;
771778
}
772779
return this;
773780
}
@@ -793,6 +800,10 @@ export default class HTMLElement extends Node {
793800
return `${name}=${this.quoteAttribute(String(val))}`;
794801
})
795802
.join(' ');
803+
// Update this._id
804+
if ('id' in attributes) {
805+
this._id = attributes['id'];
806+
}
796807
return this;
797808
}
798809

@@ -1006,6 +1017,9 @@ const kElementsClosedByClosing = {
10061017
th: { tr: true, table: true, TR: true, TABLE: true },
10071018
TH: { tr: true, table: true, TR: true, TABLE: true },
10081019
} as Record<string, Record<string, boolean>>;
1020+
const kElementsClosedByClosingExcept = {
1021+
p: { a: true, audio: true, del: true, ins: true, map: true, noscript: true, video: true },
1022+
} as Record<string, Record<string, boolean>>;
10091023

10101024
export interface Options {
10111025
lowerCaseTagName?: boolean;
@@ -1192,22 +1206,44 @@ export function base_parse(data: string, options = {} as Partial<Options>) {
11921206
continue;
11931207
}
11941208
}
1195-
if (options.closeAllByClosing === true) {
1196-
// If tag was opened, close all nested tags
1197-
let i;
1198-
for (i = stack.length - 2; i >= 0; i--) {
1199-
if (stack[i].rawTagName === tagName) break;
1200-
}
1201-
if (i >= 0) {
1202-
while (stack.length > i) {
1209+
const openTag =
1210+
currentParent.rawTagName ?
1211+
currentParent.rawTagName.toLowerCase() :
1212+
'';
1213+
if (kElementsClosedByClosingExcept[openTag]) {
1214+
const closingTag = tagName.toLowerCase();
1215+
if (stack.length > 1) {
1216+
const possibleContainer = stack[stack.length - 2];
1217+
if (
1218+
possibleContainer &&
1219+
possibleContainer.rawTagName &&
1220+
possibleContainer.rawTagName.toLowerCase() === closingTag &&
1221+
!kElementsClosedByClosingExcept[openTag][closingTag]
1222+
) {
12031223
// Update range end for closed tag
12041224
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
12051225
stack.pop();
12061226
currentParent = arr_back(stack);
1227+
continue;
12071228
}
1208-
continue;
12091229
}
12101230
}
1231+
if (options.closeAllByClosing === true) {
1232+
// If tag was opened, close all nested tags
1233+
let i;
1234+
for (i = stack.length - 2; i >= 0; i--) {
1235+
if (stack[i].rawTagName === tagName) break;
1236+
}
1237+
if (i >= 0) {
1238+
while (stack.length > i) {
1239+
// Update range end for closed tag
1240+
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
1241+
stack.pop();
1242+
currentParent = arr_back(stack);
1243+
}
1244+
continue;
1245+
}
1246+
}
12111247
// Use aggressive strategy to handle unmatching markups.
12121248
break;
12131249
}

test/tests/issues/294.js

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,12 @@
11
const { parse, valid } = require('@test/test-target');
22

3-
describe('issue 294 Closing tag is missing but valid HTML is still not parseable', function () {
4-
it('invalid HTML missing closing tag should not parse', function () {
5-
const content = '<body><main class=h-entry><p>hello</main></body>';
6-
valid(content).should.equal(false);
3+
describe('issue 294 Closing tag is missing but valid HTML still not parsable', function () {
4+
it('Valid HTML missing closing p tag should parse', function () {
5+
const content = '<body><main class=h-entry><p>hello</main></body>';
6+
valid(content).should.equal(true);
77
const root = parse(content);
8-
const list = root.querySelectorAll('.h-entry');
9-
list.length.should.equal(0);
10-
});
11-
it('invalid HTML missing closing tag should parse', function () {
12-
const content = '<body><main class=h-entry><p>hello</main></body>';
13-
valid(content).should.equal(false);
14-
const root = parse(content, {
15-
closeAllByClosing: true
16-
});
17-
const list = root.querySelectorAll('.h-entry');
18-
list.length.should.equal(1);
8+
root.outerHTML.should.equal('<body><main class=h-entry><p>hello</p></main></body>');
9+
const list = root.querySelectorAll('.h-entry');
10+
list.length.should.equal(1);
1911
});
2012
});

test/tests/unclosedtags.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
const { parse, valid } = require('@test/test-target');
2+
3+
describe('Unclosed tags', function () {
4+
it('Unclosed tags should be closed at end of parent element', function () {
5+
const html = '<p>not bold<b>bold</p><p>more</p>';
6+
valid(html, { closeAllByClosing: true }).should.be.true();
7+
const root = parse(html, { closeAllByClosing: true });
8+
root.outerHTML.should.equal('<p>not bold<b>bold</b></p><p>more</p>');
9+
});
10+
it('Nested unclosed tags should be closed at end of parent element', function () {
11+
const html = '<p>not bold<b>bold<i>bold italic</p><p>more</p>';
12+
valid(html, { closeAllByClosing: true }).should.be.true();
13+
const root = parse(html, { closeAllByClosing: true });
14+
root.outerHTML.should.equal('<p>not bold<b>bold<i>bold italic</i></b></p><p>more</p>');
15+
});
16+
});

test/tests/valid.js

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,25 @@ describe('parseWithValidation', function () {
2020
result.should.eql(false);
2121
})
2222

23-
it('hillcrestpartyrentals.html should return Object with valid: false. not closing <p> tag on line 476', function () {
23+
// #294: Closing tag is missing but valid HTML is still not parseable
24+
//
25+
// Tag omission in text/html:
26+
// A p element's end tag can be omitted if the p element is immediately
27+
// followed by an address, article, aside, blockquote, details, dialog,
28+
// div, dl, fieldset, figcaption, figure, footer, form, h1, h2, h3, h4,
29+
// h5, h6, header, hgroup, hr, main, menu, nav, ol, p, pre, search,
30+
// section, table, or ul element, or if there is no more content in the
31+
// parent element and the parent element is an HTML element that is not
32+
// an a, audio, del, ins, map, noscript, or video element, or an
33+
// autonomous custom element.
34+
//
35+
// Based on this, hillcrestpartyrentals.html is in fact valid HTML. All
36+
// the p elements missing close tags are contained within td elements
37+
// and, therefore, should be closed when there is no more content in the
38+
// parent td element (i.e. at the `</td>`).
39+
it('hillcrestpartyrentals.html should return Object with valid: true. not closing <p> tag on line 476', function () {
2440
const result = valid(fs.readFileSync(__dirname + '/../assets/html/hillcrestpartyrentals.html').toString());
25-
result.should.eql(false);
41+
result.should.eql(true);
2642
})
2743

2844
it('google.html should return Object with valid: true', function () {

0 commit comments

Comments
 (0)