From 81869db4325b63d08b4a89ae29952ac6602a7457 Mon Sep 17 00:00:00 2001
From: Taksh <takshkothari09@gmail.com>
Date: Mon, 20 Apr 2026 16:49:20 +0530
Subject: [PATCH] Make escape_html_characters' comment/CDATA stripping safe

escape_html_characters uses re.sub with r"<!--.*-->" and
r"<!\[CDATA\[.*\]\]>" to strip HTML comments and CDATA sections before
Elasticsearch indexing. Two issues with those patterns:

1. `.*` is greedy, so a string containing two comments like
       "<!-- a --> keep this <!-- b -->"
   collapses to "" - everything between the first "<!--" and the last
   "-->" is eaten, including legitimate indexable text.
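2. The default `.` does not match newlines, so on their own these
   patterns cannot match a multi-line comment or CDATA block such as
       <!--
         notes
       -->
   As the calls are currently nested, the innermost re.sub collapses
   whitespace runs (`\s`, which includes newlines) to single spaces
   before these patterns run, so such blocks are already matched
   today; the patterns only stay correct as long as that ordering is
   kept, and a reordering would let multi-line blocks reach the
   search index.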

Switch to non-greedy `.*?` and pass `flags=re.DOTALL` so each pattern
matches exactly one comment/CDATA span, including multi-line ones,
without swallowing surrounding text.
---
 xmodule/util/misc.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/xmodule/util/misc.py b/xmodule/util/misc.py
index 3aae7ef6f30c..8634d41aa05c 100644
--- a/xmodule/util/misc.py
+++ b/xmodule/util/misc.py
@@ -48,19 +48,21 @@ def escape_html_characters(content):
 
     # Removing HTML comments
     return re.sub(
-        r"<!--.*-->",
+        r"<!--.*?-->",
         "",
         # Removing HTML CDATA
         re.sub(
-            r"<!\[CDATA\[.*\]\]>",
+            r"<!\[CDATA\[.*?\]\]>",
             "",
             # Removing HTML-encoded non-breaking space characters
             re.sub(
                 r"(\s|&nbsp;|//)+",
                 " ",
                 html_to_text(content)
-            )
-        )
+            ),
+            flags=re.DOTALL
+        ),
+        flags=re.DOTALL
     )