diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java b/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java index a92a050d..4e3d0b69 100644 --- a/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java +++ b/src/main/java/org/htmlunit/cyberneko/HTMLScanner.java @@ -1467,6 +1467,16 @@ protected int scanEntityRef(final XMLString str, final XMLString plainValue, fin } return returnEntityRefString(str, content); } + + if ('&' == nextChar) { + fCurrentEntity.rewind(1); + if (plainValue != null) { + plainValue.append('&'); + } + str.clearAndAppend('&'); + return returnEntityRefString(str, content); + } + str.append((char) nextChar); if ('#' == nextChar) { diff --git a/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java b/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java index b912367c..391a2c1b 100644 --- a/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java +++ b/src/main/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParser.java @@ -83,6 +83,7 @@ public void setMatchFromCode() { // a surrogate-character-reference parse error. Set the character reference code to 0xFFFD if (Character.isSurrogate((char) code_)) { match_ = "\uFFFD"; + matchLength_ = consumedCount_; return; } @@ -196,7 +197,7 @@ public void setMatchFromCode() { return; case 0x98: - match_ = "\u20DC"; + match_ = "\u02DC"; matchLength_ = consumedCount_; return; diff --git a/src/test/java/org/htmlunit/cyberneko/GeneralTest.java b/src/test/java/org/htmlunit/cyberneko/GeneralTest.java index d510396c..a47c65b9 100644 --- a/src/test/java/org/htmlunit/cyberneko/GeneralTest.java +++ b/src/test/java/org/htmlunit/cyberneko/GeneralTest.java @@ -315,4 +315,40 @@ public void parseInputSourceIANAEncoding() throws Exception { + ")html"; assertEquals(expected.trim(), out.toString().trim()); } + + @Test + public void textContentConsecutiveAmpersandsBeforeNamedEntity() throws Exception { + final String expected = "(html" + NL + + "(head" + NL + + ")head" + NL + + "(body" + NL + + "\"FOO&&&>BAR" + NL + + ")body" + NL + + ")html"; + doTest("FOO&&&>BAR", null, expected); + } + + @Test + public void textContentSurrogateNumericReference() throws Exception { + final String expected = "(html" + NL + + "(head" + NL + + ")head" + NL + + "(body" + NL + + "\"FOO\uFFFDZOO" + NL + + ")body" + NL + + ")html"; + doTest("FOO�ZOO", null, expected); + } + + @Test + public void textContentWindows1252ControlMapping() throws Exception { + final String expected = "(html" + NL + + "(head" + NL + + ")head" + NL + + "(body" + NL + + "\"FOO\u02DCZOO" + NL + + ")body" + NL + + ")html"; + doTest("FOO˜ZOO", null, expected); + } } diff --git a/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java b/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java index c7d191c6..ae321246 100644 --- a/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java +++ b/src/test/java/org/htmlunit/cyberneko/HTMLUnicodeEntitiesParserTest.java @@ -150,7 +150,21 @@ public void parseSurrogate() { } assertEquals("\uFFFD", parser.getMatch()); - assertEquals(6, parser.getRewindCount()); + assertEquals(0, parser.getRewindCount()); + } + + @Test + public void parseWindows1252SmallTilde() { + final HTMLUnicodeEntitiesParser parser = new HTMLUnicodeEntitiesParser(); + + final String input = "x98;"; + int i = 0; + while (parser.parseNumeric(input.charAt(i))) { + i++; + } + + assertEquals("\u02DC", parser.getMatch()); + assertEquals(0, parser.getRewindCount()); } @Test