diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java b/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java
index a92a050d..4e3d0b69 100644
--- a/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java
+++ b/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java
@@ -1467,6 +1467,16 @@ protected int scanEntityRef(final XMLString str, final XMLString plainValue, fin
}
return returnEntityRefString(str, content);
}
+
+ if ('&' == nextChar) { // the char just read is itself an ampersand (consecutive '&' case)
+ fCurrentEntity.rewind(1); // NOTE(review): presumably un-reads that '&' so it starts its own reference scan; confirm
+ if (plainValue != null) {
+ plainValue.append('&');
+ }
+ str.clearAndAppend('&'); // NOTE(review): str presumably ends up holding only a single '&'; confirm
+ return returnEntityRefString(str, content);
+ }
+
str.append((char) nextChar);
if ('#' == nextChar) {
diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java b/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java
index b912367c..391a2c1b 100644
--- a/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java
+++ b/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java
@@ -83,6 +83,7 @@ public void setMatchFromCode() {
// a surrogate-character-reference parse error. Set the character reference code to 0xFFFD
if (Character.isSurrogate((char) code_)) {
match_ = "\uFFFD";
+ matchLength_ = consumedCount_; // same bookkeeping as the 0x98 arm; parseSurrogate now expects rewind count 0
return;
}
@@ -196,7 +197,7 @@ public void setMatchFromCode() {
return;
case 0x98:
- match_ = "\u20DC";
+ match_ = "\u02DC"; // U+02DC SMALL TILDE; the old value U+20DC is a different character
matchLength_ = consumedCount_;
return;
diff --git a/src/test/java/org/htmlunit/cyberneko/GeneralTest.java b/src/test/java/org/htmlunit/cyberneko/GeneralTest.java
index d510396c..a47c65b9 100644
--- a/src/test/java/org/htmlunit/cyberneko/GeneralTest.java
+++ b/src/test/java/org/htmlunit/cyberneko/GeneralTest.java
@@ -315,4 +315,40 @@ public void parseInputSourceIANAEncoding() throws Exception {
+ ")html";
assertEquals(expected.trim(), out.toString().trim());
}
+
+ @Test
+ public void textContentConsecutiveAmpersandsBeforeNamedEntity() throws Exception { // "&&&" then a named entity
+ final String expected = "(html" + NL
+ + "(head" + NL
+ + ")head" + NL
+ + "(body" + NL
+ + "\"FOO&&&>BAR" + NL // three literal ampersands kept, named entity decoded to '>'
+ + ")body" + NL
+ + ")html";
+ doTest("FOO&&&&gt;BAR", null, expected); // input must contain a real named entity after the '&' run
+ }
+
+ @Test
+ public void textContentSurrogateNumericReference() throws Exception { // surrogate code point in a numeric reference
+ final String expected = "(html" + NL
+ + "(head" + NL
+ + ")head" + NL
+ + "(body" + NL
+ + "\"FOO\uFFFDZOO" + NL // replaced by U+FFFD REPLACEMENT CHARACTER, nothing left over
+ + ")body" + NL
+ + ")html";
+ doTest("FOO&#xD800;ZOO", null, expected); // 0xD800 is a lone high surrogate
+ }
+
+ @Test
+ public void textContentWindows1252ControlMapping() throws Exception { // C1 control reference remapped per HTML table
+ final String expected = "(html" + NL
+ + "(head" + NL
+ + ")head" + NL
+ + "(body" + NL
+ + "\"FOO\u02DCZOO" + NL // 0x98 resolves to U+02DC SMALL TILDE
+ + ")body" + NL
+ + ")html";
+ doTest("FOO&#x98;ZOO", null, expected); // input must carry the numeric reference itself
+ }
}
diff --git a/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java b/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java
index c7d191c6..ae321246 100644
--- a/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java
+++ b/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java
@@ -150,7 +150,21 @@ public void parseSurrogate() {
}
assertEquals("\uFFFD", parser.getMatch());
- assertEquals(6, parser.getRewindCount());
+ assertEquals(0, parser.getRewindCount());
+ }
+
+ @Test
+ public void parseWindows1252SmallTilde() { // hex reference 0x98 must yield U+02DC
+ final HTMLUnicodeEntitiesParser parser = new HTMLUnicodeEntitiesParser();
+
+ final String input = "x98;"; // NOTE(review): no leading "&#"; presumably consumed before parseNumeric runs, confirm
+ int i = 0;
+ while (parser.parseNumeric(input.charAt(i))) { // feed one char at a time until the parser returns false
+ i++;
+ }
+
+ assertEquals("\u02DC", parser.getMatch());
+ assertEquals(0, parser.getRewindCount()); // no rewind expected for this input
}
@Test