From 81869db4325b63d08b4a89ae29952ac6602a7457 Mon Sep 17 00:00:00 2001
From: Taksh
Date: Mon, 20 Apr 2026 16:49:20 +0530
Subject: [PATCH] Make escape_html_characters' comment/CDATA stripping safe

escape_html_characters uses re.sub with r"<!--.*-->" and
r"<!\[CDATA\[.*\]\]>" to strip HTML comments and CDATA sections before
ElasticSearch indexing. Two issues with those patterns:

1. `.*` is greedy, so a string containing two comments like
   "<!-- a --> keep this <!-- b -->" collapses to "" - everything
   between the first "<!--" and the last "-->" is eaten, including
   legitimate indexable text.

2. The default `.` does not match newlines, so any multi-line comment
   or CDATA block slips through completely and ends up in the search
   index.

Switch to non-greedy `.*?` and pass `flags=re.DOTALL` so each pattern
matches exactly one comment/CDATA span, including multi-line ones,
without swallowing surrounding text.
---
 xmodule/util/misc.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/xmodule/util/misc.py b/xmodule/util/misc.py
index 3aae7ef6f30c..8634d41aa05c 100644
--- a/xmodule/util/misc.py
+++ b/xmodule/util/misc.py
@@ -48,19 +48,21 @@ def escape_html_characters(content):
 
     # Removing HTML comments
     return re.sub(
-        r"<!--.*-->",
+        r"<!--.*?-->",
         "",
         # Removing HTML CDATA
         re.sub(
-            r"<!\[CDATA\[.*\]\]>",
+            r"<!\[CDATA\[.*?\]\]>",
             "",
             # Removing HTML-encoded non-breaking space characters
             re.sub(
                 r"(\s|&nbsp;|//)+",
                 " ",
                 html_to_text(content)
-            )
-        )
+            ),
+            flags=re.DOTALL
+        ),
+        flags=re.DOTALL
     )
 
 