Skip to content

Commit 94a671f

Browse files
committed
* fix translation of 0x98 to \u02DC
* fix surrogate-character-reference parse error handling
* fix consecutive ampersands before named entity parsing
1 parent e185589 commit 94a671f

4 files changed

Lines changed: 52 additions & 3 deletions

File tree

src/main/java/org/htmlunit/cyberneko/HTMLScanner.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1469,6 +1469,15 @@ protected int scanEntityRef(final XMLString str, final XMLString plainValue, fin
14691469
}
14701470
str.append((char) nextChar);
14711471

1472+
if ('&' == nextChar) {
1473+
fCurrentEntity.rewind(1);
1474+
if (plainValue != null) {
1475+
plainValue.append('&');
1476+
}
1477+
str.clearAndAppend('&');
1478+
return returnEntityRefString(str, content);
1479+
}
1480+
14721481
if ('#' == nextChar) {
14731482
fUnicodeEntitiesParser.reset();
14741483

src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ public void setMatchFromCode() {
8383
// a surrogate-character-reference parse error. Set the character reference code to 0xFFFD
8484
if (Character.isSurrogate((char) code_)) {
8585
match_ = "\uFFFD";
86+
matchLength_ = consumedCount_;
8687
return;
8788
}
8889

@@ -196,7 +197,7 @@ public void setMatchFromCode() {
196197
return;
197198

198199
case 0x98:
199-
match_ = "\u20DC";
200+
match_ = "\u02DC";
200201
matchLength_ = consumedCount_;
201202
return;
202203

src/test/java/org/htmlunit/cyberneko/GeneralTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,4 +315,40 @@ public void parseInputSourceIANAEncoding() throws Exception {
315315
+ ")html";
316316
assertEquals(expected.trim(), out.toString().trim());
317317
}
318+
319+
@Test
320+
public void textContentConsecutiveAmpersandsBeforeNamedEntity() throws Exception {
321+
final String expected = "(html" + NL
322+
+ "(head" + NL
323+
+ ")head" + NL
324+
+ "(body" + NL
325+
+ "\"FOO&&&>BAR" + NL
326+
+ ")body" + NL
327+
+ ")html";
328+
doTest("FOO&&&gt;BAR", null, expected);
329+
}
330+
331+
@Test
332+
public void textContentSurrogateNumericReference() throws Exception {
333+
final String expected = "(html" + NL
334+
+ "(head" + NL
335+
+ ")head" + NL
336+
+ "(body" + NL
337+
+ "\"FOO\uFFFDZOO" + NL
338+
+ ")body" + NL
339+
+ ")html";
340+
doTest("FOO&#xD800;ZOO", null, expected);
341+
}
342+
343+
@Test
344+
public void textContentWindows1252ControlMapping() throws Exception {
345+
final String expected = "(html" + NL
346+
+ "(head" + NL
347+
+ ")head" + NL
348+
+ "(body" + NL
349+
+ "\"FOO\u02DCZOO" + NL
350+
+ ")body" + NL
351+
+ ")html";
352+
doTest("FOO&#x98;ZOO", null, expected);
353+
}
318354
}

src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,12 @@ public void parseSurrogate() {
148148
while (parser.parseNumeric(input.charAt(i))) {
149149
i++;
150150
}
151-
151+
// A surrogate code point (e.g. &#xD800;) is invalid in HTML, so the parser emits
152+
// the replacement character U+FFFD. Because the semicolon-terminated reference
153+
// was fully consumed, matchLength equals consumedCount and rewindCount is 0 —
154+
// no characters need to be put back into the input stream.
152155
assertEquals("\uFFFD", parser.getMatch());
153-
assertEquals(6, parser.getRewindCount());
156+
assertEquals(0, parser.getRewindCount());
154157
}
155158

156159
@Test

0 commit comments

Comments
 (0)