From fc39723bbc70a78418bb127e11da9a7b37d48c23 Mon Sep 17 00:00:00 2001
From: Raman Marozau <raman@worktif.com>
Date: Thu, 9 Apr 2026 01:50:15 +0200
Subject: [PATCH 1/2] fix(fetch): add fallback extraction for
 readability-stripped content

- Add fallback mechanism for when readability extracts minimal content (e.g., pages whose real content sits in hidden SSR hydration divs)
- Implement minimum content length threshold (1% of original HTML) to detect over-aggressive stripping
- Fall back to extraction without readability and, if the threshold is still not met, to markdownify on the raw HTML
- Update extract_content_from_html to gracefully handle empty content instead of returning error
- Add comprehensive test suite for fallback scenarios including SSR patterns and hidden content
- Rename test_empty_content_returns_error to test_empty_content and assert graceful handling (no <error> tag) instead of an error message
---
 src/fetch/src/mcp_server_fetch/server.py | 42 +++++++++--
 src/fetch/tests/test_server.py           | 95 ++++++++++++++++++++++--
 2 files changed, 126 insertions(+), 11 deletions(-)
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..dc32afa5f9 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -27,6 +27,10 @@
 def extract_content_from_html(html: str) -> str:
     """Extract and convert HTML content to Markdown format.
 
+    Uses readability for content extraction with fallback mechanisms for cases
+    where readability strips too much content (e.g. sites using hidden divs
+    for SSR hydration).
+
     Args:
         html: Raw HTML content to process
 
@@ -36,12 +40,38 @@ def extract_content_from_html(html: str) -> str:
     ret = readabilipy.simple_json.simple_json_from_html_string(
         html, use_readability=True
     )
-    if not ret["content"]:
-        return "<error>Page failed to be simplified from HTML</error>"
-    content = markdownify.markdownify(
-        ret["content"],
-        heading_style=markdownify.ATX,
-    )
+    content_html = ret.get("content", "")
+    if content_html:
+        content = markdownify.markdownify(
+            content_html,
+            heading_style=markdownify.ATX,
+        )
+    else:
+        content = ""
+
+    # If readability extracted very little text compared to the original HTML,
+    # it likely stripped meaningful content (e.g. hidden SSR hydration divs).
+    # Fall back to extraction without readability, then raw markdownify.
+    min_length = max(1, len(html) // 100)
+    content_text = content.strip()
+    if len(content_text) < min_length:
+        ret = readabilipy.simple_json.simple_json_from_html_string(
+            html, use_readability=False
+        )
+        if ret["content"]:
+            content = markdownify.markdownify(
+                ret["content"],
+                heading_style=markdownify.ATX,
+            )
+            if len(content.strip()) >= min_length:
+                return content
+
+        # Last resort: convert the raw HTML directly
+        content = markdownify.markdownify(
+            html,
+            heading_style=markdownify.ATX,
+        )
+
     return content
 
 
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..9f23e82703 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -81,11 +81,11 @@ def test_html_with_links(self):
         result = extract_content_from_html(html)
         assert "Example" in result
 
-    def test_empty_content_returns_error(self):
-        """Test that empty/invalid HTML returns error message."""
-        html = ""
-        result = extract_content_from_html(html)
-        assert "<error>" in result
+    def test_empty_content(self):
+        """Test that empty HTML is handled gracefully via fallback."""
+        result = extract_content_from_html("")
+        # Empty input produces empty output after fallback
+        assert "<error>" not in result
 
 
 class TestCheckMayAutonomouslyFetchUrl:
@@ -324,3 +324,88 @@ async def test_fetch_with_proxy(self):
 
             # Verify AsyncClient was called with proxy
             mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+
+
+class TestExtractContentFromHtmlFallback:
+    """Tests for extract_content_from_html readability fallback."""
+
+    def test_normal_html_no_fallback(self):
+        """Test that normal HTML with visible content works without fallback."""
+        html = """
+        <html>
+        <head><title>Normal Page</title></head>
+        <body>
+            <article>
+                <h1>Welcome</h1>
+                <p>This is a normal page with plenty of visible content that
+                readability should have no trouble extracting properly.</p>
+                <p>Here is another paragraph with more content to ensure
+                we have enough text for the extraction to work well.</p>
+            </article>
+        </body>
+        </html>
+        """
+        result = extract_content_from_html(html)
+        assert "normal page" in result.lower()
+        assert "<error>" not in result
+
+    def test_hidden_ssr_content_triggers_fallback(self):
+        """Test that hidden SSR content triggers fallback extraction."""
+        visible_shell = "<p>Loading...</p>"
+        hidden_content = "<p>{}</p>".format(" ".join(["word"] * 500))
+        html = """
+        <html>
+        <body>
+            <div id="shell">{}</div>
+            <div style="visibility:hidden" id="ssr-data">{}</div>
+        </body>
+        </html>
+        """.format(visible_shell, hidden_content)
+        result = extract_content_from_html(html)
+        # Fallback should recover the hidden content
+        assert len(result.strip()) > len(html) // 100
+
+    def test_readability_empty_content_triggers_fallback(self):
+        """Test that readability returning empty content triggers fallback."""
+        content_text = " ".join(["meaningful"] * 200)
+        html = """
+        <html>
+        <body>
+            <div><p>{}</p></div>
+        </body>
+        </html>
+        """.format(content_text)
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            # First call (use_readability=True) returns empty content
+            # Second call (use_readability=False) returns empty content too
+            mock_readability.return_value = {"content": None}
+            result = extract_content_from_html(html)
+        # Fallback to raw markdownify should recover content
+        assert "meaningful" in result
+        assert "<error>" not in result
+
+    def test_small_visible_shell_large_hidden_ssr(self):
+        """Test realistic SSR pattern: small visible loading shell + large hidden content."""
+        ssr_paragraphs = "\n".join(
+            "<p>Article paragraph {} with enough text to be meaningful content.</p>".format(i)
+            for i in range(50)
+        )
+        html = """
+        <html>
+        <head><title>SSR App</title></head>
+        <body>
+            <div id="app">
+                <div class="spinner">Loading application...</div>
+            </div>
+            <div style="position:absolute;top:-9999px" id="__ssr_data__">
+                <article>
+                    <h1>Full Article Title</h1>
+                    {}
+                </article>
+            </div>
+        </body>
+        </html>
+        """.format(ssr_paragraphs)
+        result = extract_content_from_html(html)
+        # Should not return just "Loading application..." — fallback should recover content
+        assert len(result.strip()) > len(html) // 100

From 98352637ab989a5e2123a5fca179a474c0ebb2c1 Mon Sep 17 00:00:00 2001
From: Raman Marozau <raman@worktif.com>
Date: Tue, 14 Apr 2026 14:18:51 +0200
Subject: [PATCH 2/2] test(fetch): expand extract_content_from_html fallback
 test coverage

- Replace two generic tests (test_normal_html_no_fallback, test_readability_empty_content_triggers_fallback) with 15 targeted test cases covering fallback scenarios
- Add tests for readability returning sufficient content without fallback
- Add tests for readability stripping content and triggering fallback to non-readability mode
- Add tests for both readability modes failing and falling back to raw markdownify
- Add tests for None content and missing 'content' key edge cases
- Add tests for 1% threshold calculation and boundary conditions
- Add tests for whitespace-only content triggering fallback
- Add tests for stage 2 (no-readability) failures falling back to raw extraction
- Improve test clarity with descriptive names and docstrings explaining each scenario
- Ensure comprehensive coverage of the three-stage fallback extraction pipeline
---
 src/fetch/tests/test_server.py | 223 +++++++++++++++++++++++++++------
 1 file changed, 185 insertions(+), 38 deletions(-)

diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 9f23e82703..ff43779d43 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -329,24 +329,149 @@ async def test_fetch_with_proxy(self):
 class TestExtractContentFromHtmlFallback:
     """Tests for extract_content_from_html readability fallback."""
 
-    def test_normal_html_no_fallback(self):
-        """Test that normal HTML with visible content works without fallback."""
-        html = """
-        <html>
-        <head><title>Normal Page</title></head>
-        <body>
-            <article>
-                <h1>Welcome</h1>
-                <p>This is a normal page with plenty of visible content that
-                readability should have no trouble extracting properly.</p>
-                <p>Here is another paragraph with more content to ensure
-                we have enough text for the extraction to work well.</p>
-            </article>
-        </body>
-        </html>
-        """
+    def test_readability_sufficient_content_no_fallback(self):
+        """When Readability returns enough content, no fallback is triggered."""
+        html = "<html><body>" + "<p>word </p>" * 200 + "</body></html>"
+        readability_content = "<div>" + "<p>word </p>" * 200 + "</div>"
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": readability_content}
+            result = extract_content_from_html(html)
+            assert mock_readability.call_count == 1
+            assert "word" in result
+
+    def test_readability_strips_content_falls_back_to_no_readability(self):
+        """When Readability returns too little, falls back to non-Readability extraction."""
+        html = "<html><body>" + "<p>content </p>" * 500 + "</body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                return {"content": "<div>Loading...</div>"}
+            else:
+                return {"content": "<div>" + "<p>content </p>" * 500 + "</div>"}
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "content" in result
+            assert len(result.strip()) > 100
+
+    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
+        """When both readabilipy modes return too little, falls back to raw markdownify."""
+        html = "<html><body>" + "<p>important data </p>" * 300 + "</body></html>"
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": ""}
+            result = extract_content_from_html(html)
+            assert "important data" in result
+            assert mock_readability.call_count == 2
+
+    def test_readability_none_content_triggers_fallback(self):
+        """When Readability returns None content, fallback is triggered."""
+        html = "<html><body>" + "<p>real content </p>" * 200 + "</body></html>"
+
+        call_count = [0]
+
+        def mock_simple_json(h, use_readability=True):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return {"content": None}
+            else:
+                return {"content": "<div>" + "<p>real content </p>" * 200 + "</div>"}
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "real content" in result
+
+    def test_readability_missing_content_key_triggers_fallback(self):
+        """When Readability returns dict without 'content' key, fallback is triggered."""
+        html = "<html><body>" + "<p>data </p>" * 200 + "</body></html>"
+
+        call_count = [0]
+
+        def mock_simple_json(h, use_readability=True):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return {}  # no "content" key at all
+            else:
+                return {"content": "<div>" + "<p>data </p>" * 200 + "</div>"}
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "data" in result
+
+    def test_threshold_is_one_percent_of_html(self):
+        """The fallback threshold is 1% of the input HTML length."""
+        padding = "x" * 9000
+        html = f'<html><body><div style="visibility:hidden">{padding}</div><p>tiny</p></body></html>'
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                return {"content": "<p>tiny</p>"}
+            else:
+                return {"content": f"<div>{padding}</div><p>tiny</p>"}
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            # "tiny" (4 chars) < 1% of ~9100 chars → fallback triggered
+            assert len(result.strip()) > 50
+
+    def test_content_at_threshold_boundary_no_fallback(self):
+        """Content at or above the 1% threshold does not trigger fallback (this case sits well above it)."""
+        # Build HTML of known size; readability returns content comfortably above the threshold
+        filler = "a" * 1000
+        html = f"<html><body><p>{filler}</p></body></html>"
+        # 1% of ~1030 chars ≈ 10 chars; return content well above that
+        readability_content = f"<p>{filler}</p>"
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": readability_content}
+            result = extract_content_from_html(html)
+            assert mock_readability.call_count == 1
+            assert "a" in result
+
+    def test_whitespace_only_readability_output_triggers_fallback(self):
+        """Readability returning whitespace-only content triggers fallback."""
+        html = "<html><body>" + "<p>payload </p>" * 300 + "</body></html>"
+
+        def mock_simple_json(h, use_readability=True):
+            if use_readability:
+                return {"content": "<div>   \n\t  </div>"}
+            else:
+                return {"content": "<div>" + "<p>payload </p>" * 300 + "</div>"}
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            assert "payload" in result
+
+    def test_no_readability_also_returns_too_little_falls_to_raw(self):
+        """When stage 2 (no-readability) also returns too little, raw markdownify is used."""
+        html = "<html><body>" + "<p>deep content </p>" * 400 + "</body></html>"
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            # Both calls return minimal content
+            mock_readability.return_value = {"content": "<p>x</p>"}
+            result = extract_content_from_html(html)
+            assert "deep content" in result
+            assert mock_readability.call_count == 2
+
+    def test_no_readability_returns_none_falls_to_raw(self):
+        """When stage 2 (no-readability) returns None content, raw markdownify is used."""
+        html = "<html><body>" + "<p>fallback target </p>" * 200 + "</body></html>"
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
+            mock_readability.return_value = {"content": None}
+            result = extract_content_from_html(html)
+            assert "fallback target" in result
+
+    def test_empty_html_returns_empty_string(self):
+        """Empty HTML input is handled without producing error tags."""
+        result = extract_content_from_html("")
+        assert "<error>" not in result
+
+    def test_small_html_min_length_is_one(self):
+        """For very small HTML, min_length floors at 1 so fallback still works."""
+        html = "<p>Hi</p>"
         result = extract_content_from_html(html)
-        assert "normal page" in result.lower()
         assert "<error>" not in result
 
     def test_hidden_ssr_content_triggers_fallback(self):
@@ -362,28 +487,8 @@ def test_hidden_ssr_content_triggers_fallback(self):
         </html>
         """.format(visible_shell, hidden_content)
         result = extract_content_from_html(html)
-        # Fallback should recover the hidden content
         assert len(result.strip()) > len(html) // 100
 
-    def test_readability_empty_content_triggers_fallback(self):
-        """Test that readability returning empty content triggers fallback."""
-        content_text = " ".join(["meaningful"] * 200)
-        html = """
-        <html>
-        <body>
-            <div><p>{}</p></div>
-        </body>
-        </html>
-        """.format(content_text)
-        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
-            # First call (use_readability=True) returns empty content
-            # Second call (use_readability=False) returns empty content too
-            mock_readability.return_value = {"content": None}
-            result = extract_content_from_html(html)
-        # Fallback to raw markdownify should recover content
-        assert "meaningful" in result
-        assert "<error>" not in result
-
     def test_small_visible_shell_large_hidden_ssr(self):
         """Test realistic SSR pattern: small visible loading shell + large hidden content."""
         ssr_paragraphs = "\n".join(
@@ -407,5 +512,47 @@ def test_small_visible_shell_large_hidden_ssr(self):
         </html>
         """.format(ssr_paragraphs)
         result = extract_content_from_html(html)
-        # Should not return just "Loading application..." — fallback should recover content
         assert len(result.strip()) > len(html) // 100
+
+    def test_opacity_zero_hidden_content(self):
+        """Content hidden via opacity:0 should be recovered by fallback."""
+        hidden_text = " ".join(["secret"] * 300)
+        html = """
+        <html>
+        <body>
+            <div>Visible shell</div>
+            <div style="opacity:0"><p>{}</p></div>
+        </body>
+        </html>
+        """.format(hidden_text)
+        result = extract_content_from_html(html)
+        assert len(result.strip()) > len(html) // 100
+
+    def test_result_never_contains_error_tags(self):
+        """No code path in the updated function should produce <error> tags."""
+        cases = [
+            "",
+            "<html></html>",
+            "<html><body></body></html>",
+            "<p>short</p>",
+        ]
+        for html in cases:
+            result = extract_content_from_html(html)
+            assert "<error>" not in result, f"Got <error> for input: {html!r}"
+
+    def test_all_three_stages_called_when_needed(self):
+        """Verify the full three-stage cascade: readability → no-readability → raw markdownify."""
+        html = "<html><body>" + "<p>cascade </p>" * 400 + "</body></html>"
+
+        calls = []
+
+        def mock_simple_json(h, use_readability=True):
+            calls.append(use_readability)
+            return {"content": ""}
+
+        with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
+            result = extract_content_from_html(html)
+            # Stage 1: use_readability=True, Stage 2: use_readability=False
+            assert calls == [True, False]
+            # Stage 3: raw markdownify recovers content
+            assert "cascade" in result