Skip to content

Commit 015e363

Browse files
committed
improved entity error handling
1 parent 48b4a7f commit 015e363

7 files changed

Lines changed: 53 additions & 16 deletions

File tree

src/main/java/org/htmlunit/cyberneko/HTMLScanner.java

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,11 +1270,9 @@ else if (c == '<') {
12701270
fCurrentEntity.rewind();
12711271
break;
12721272
}
1273-
else {
1274-
if (!str.appendCodePoint(c)) {
1275-
if (fReportErrors_) {
1276-
fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
1277-
}
1273+
else if (!str.appendCodePoint(c)) {
1274+
if (fReportErrors_) {
1275+
fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
12781276
}
12791277
}
12801278
}
@@ -1489,6 +1487,11 @@ protected int scanEntityRef(final XMLString str, final XMLString plainValue, fin
14891487
}
14901488
while (nextChar != -1 && fUnicodeEntitiesParser.parseNumeric(nextChar));
14911489

1490+
// EOF finalization for numeric character reference without semicolon
1491+
if (nextChar == -1) {
1492+
fUnicodeEntitiesParser.finalizeNumericAtEOF();
1493+
}
1494+
14921495
final String match = fUnicodeEntitiesParser.getMatch();
14931496
if (match == null) {
14941497
fCurrentEntity.rewind(str.length() - 1);
@@ -2567,11 +2570,9 @@ private void scanUntilEndTag(final String tagNameWithLeadingSlash) throws IOExce
25672570
fScanUntilEndTag.append('\n');
25682571
}
25692572
}
2570-
else {
2571-
if (!fScanUntilEndTag.appendCodePoint(c)) {
2572-
if (fReportErrors_) {
2573-
fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
2574-
}
2573+
else if (!fScanUntilEndTag.appendCodePoint(c)) {
2574+
if (fReportErrors_) {
2575+
fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
25752576
}
25762577
}
25772578
}
@@ -2932,11 +2933,9 @@ else if (c == '>') {
29322933
fDocumentHandler.comment(fStringBuffer, locationAugs(fCurrentEntity));
29332934
return SCAN_TRUE;
29342935
}
2935-
else {
2936-
if (!fStringBuffer.appendCodePoint(c)) {
2937-
if (fReportErrors_) {
2938-
fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
2939-
}
2936+
else if (!fStringBuffer.appendCodePoint(c)) {
2937+
if (fReportErrors_) {
2938+
fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
29402939
}
29412940
}
29422941
}

src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ public void setMatchFromCode() {
7373
// Set the character reference code to 0xFFFD.
7474
// If the number is greater than 0x10FFFF, then this is
7575
// a character-reference-outside-unicode-range parse error. Set the character reference code to 0xFFFD.
76-
if ((0x00 == code_) || (code_ > 0x10FFFF)) {
76+
if ((0x00 >= code_) || (code_ > 0x10FFFF)) {
7777
match_ = "\uFFFD";
7878
matchLength_ = consumedCount_;
7979
return;
@@ -326,4 +326,14 @@ public boolean parseNumeric(final int current) {
326326
return false;
327327
}
328328

329+
public void finalizeNumericAtEOF() {
330+
if (match_ != null) {
331+
return;
332+
}
333+
if (state_ == STATE_DECIMAL_CHAR || state_ == STATE_HEXADECIMAL_CHAR) {
334+
// treat EOF like "missing semicolon": finalize what we have
335+
setMatchFromCode();
336+
matchLength_ = consumedCount_;
337+
}
338+
}
329339
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<div>abc&#11111111111
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
(html
2+
(head
3+
)head
4+
(body
5+
(div
6+
"abc�
7+
)div
8+
)body
9+
)html
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
(div
2+
"abc�
3+
)div
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<html><head></head><body><div>abc�</div></body></html>
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
startDocument [(1,1,0) (1,1,0) false]
2+
startElement (localpart="html",rawname="html",uri="http://www.w3.org/1999/xhtml") [(-1,-1,-1) (-1,-1,-1) true]
3+
startElement (localpart="head",rawname="head",uri="http://www.w3.org/1999/xhtml") [(-1,-1,-1) (-1,-1,-1) true]
4+
endElement (localpart="head",rawname="head",uri="http://www.w3.org/1999/xhtml") [(-1,-1,-1) (-1,-1,-1) true]
5+
startElement (localpart="body",rawname="body",uri="http://www.w3.org/1999/xhtml") [(-1,-1,-1) (-1,-1,-1) true]
6+
startElement (localpart="div",rawname="div") [(1,1,0) (1,6,5) false]
7+
characters 'abc'[(1,6,5) (1,9,8) false]
8+
characters '�'[(1,9,8) (1,22,21) false]
9+
characters '
10+
'[(1,22,21) (2,1,22) false]
11+
endElement (localpart="div",rawname="div") [(-1,-1,-1) (-1,-1,-1) true]
12+
endElement (localpart="body",rawname="body",uri="http://www.w3.org/1999/xhtml") [(-1,-1,-1) (-1,-1,-1) true]
13+
endElement (localpart="html",rawname="html",uri="http://www.w3.org/1999/xhtml") [(-1,-1,-1) (-1,-1,-1) true]
14+
endDocument [(2,1,22) (2,1,22) false]

0 commit comments

Comments
 (0)